From 3805da6ecd1e6467a589a83c5a636e233748db22 Mon Sep 17 00:00:00 2001
From: Sai Abhinay Anubola <saiabhinay.anubola@amd.com>
Date: Wed, 4 Dec 2024 13:41:41 +0530
Subject: [PATCH] Combine (shl (and x, imm1), imm2) to (shl x, imm2) where
 (~imm1 << imm2) = 0

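The G_AND is redundant when every bit it clears is discarded by the
left shift anyway, i.e. when (~imm1 << imm2) = 0 within the value's
bit width. A worked example for a 32-bit x, taken from the new MIR
tests:

    (shl (and x, 0xFFFF), 16)
        ~0xFFFF          = 0xFFFF0000
        0xFFFF0000 << 16 = 0x0        ; every cleared bit is shifted out
    ==> (shl x, 16)

With mask 0xFF and shift 16 instead, (~0xFF << 16) = 0xFF000000 != 0
within 32 bits, so the AND still clears live bits and must be kept
(covered by the negative tests).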
---
 .../llvm/CodeGen/GlobalISel/CombinerHelper.h  |   5 +
 .../include/llvm/Target/GlobalISel/Combine.td |  11 +-
 .../lib/CodeGen/GlobalISel/CombinerHelper.cpp |  33 +
 .../AArch64/GlobalISel/combine-shl-and.mir    | 152 ++++
 .../CodeGen/AIE/aie2/bfloat16_to_float.ll     |   4 +-
 .../CodeGen/AMDGPU/GlobalISel/add.v2i16.ll    |  29 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll  |  10 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll   |  51 +-
 ...mbine-shl-from-extend-narrow.postlegal.mir |  40 +-
 ...ombine-shl-from-extend-narrow.prelegal.mir |  24 +-
 .../AMDGPU/GlobalISel/cvt_f32_ubyte.ll        |  14 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll   | 798 ++++++++----------
 llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll   | 653 +++++++-------
 .../AMDGPU/GlobalISel/insertelement.i8.ll     | 173 ++--
 .../llvm.amdgcn.image.load.1d.d16.ll          |  11 +-
 .../GlobalISel/llvm.amdgcn.intersect_ray.ll   |  52 +-
 .../llvm.amdgcn.raw.buffer.load.format.f16.ll |  42 +-
 ...m.amdgcn.raw.ptr.buffer.load.format.f16.ll |  42 +-
 .../llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll   |  24 +-
 .../llvm.amdgcn.raw.tbuffer.load.f16.ll       |  24 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll    |  24 +-
 ...vm.amdgcn.struct.buffer.load.format.f16.ll |  42 +-
 ...mdgcn.struct.ptr.buffer.load.format.f16.ll |  42 +-
 ...llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll |  42 +-
 .../llvm.amdgcn.struct.tbuffer.load.f16.ll    |  42 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll    |  42 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll   |  10 +-
 .../test/CodeGen/AMDGPU/GlobalISel/saddsat.ll | 115 +--
 .../test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll | 115 ++-
 .../CodeGen/AMDGPU/GlobalISel/sext_inreg.ll   |  47 +-
 .../AMDGPU/GlobalISel/shl-ext-reduce.ll       |  31 +-
 llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll    |  45 +-
 .../test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll | 115 +--
 .../test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll |  32 +-
 .../test/CodeGen/AMDGPU/GlobalISel/udivrem.ll |  39 +-
 .../test/CodeGen/AMDGPU/GlobalISel/usubsat.ll |  32 +-
 llvm/test/CodeGen/AMDGPU/global-saddr-load.ll |   6 +-
 .../CodeGen/AMDGPU/integer-mad-patterns.ll    | 285 +++----
 .../CodeGen/AMDGPU/shrink-add-sub-constant.ll |  33 +-
 39 files changed, 1521 insertions(+), 1810 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/GlobalISel/combine-shl-and.mir
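
Note for reviewers (this region is ignored when the patch is applied): a
standalone model of the legality check, for illustration only — it is not
part of the patch, and assumes a scalar type of at most 64 bits and an
in-range shift amount:

    #include <cassert>
    #include <cstdint>

    // True iff (shl (and x, AndImm), ShiftImm) may be rewritten as
    // (shl x, ShiftImm) for a value of Size bits: every bit the AND
    // clears must be shifted out of the type.
    static bool shlOfAndCombinable(uint64_t AndImm, unsigned ShiftImm,
                                   unsigned Size) {
      assert(Size >= 1 && Size <= 64 && ShiftImm < Size);
      uint64_t Mask = ~0ULL >> (64 - Size); // low Size bits
      return ((~AndImm << ShiftImm) & Mask) == 0;
    }

    int main() {
      assert(shlOfAndCombinable(0xFFFF, 16, 32)); // AND is redundant
      assert(!shlOfAndCombinable(0xFF, 16, 32));  // AND must stay
      assert(!shlOfAndCombinable(0xF, 4, 16));    // s16 negative test
      return 0;
    }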

diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
index b2132562ac3f..92c6148cd7ca 100644
--- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
+++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h
@@ -399,6 +399,11 @@ class CombinerHelper {
   void applyCombineTruncOfExt(MachineInstr &MI,
                               std::pair<Register, unsigned> &MatchInfo);
 
+  /// Transform (shl (and x, imm1), imm2) to (shl x, imm2)
+  ///    if (~imm1 << imm2) = 0
+  bool matchCombineShlOfAnd(MachineInstr &MI, Register &Reg);
+  void applyCombineShlOfAnd(MachineInstr &MI, Register &Reg);
+
   /// Transform trunc (shl x, K) to shl (trunc x), K
   ///    if K < VT.getScalarSizeInBits().
   ///
diff --git a/llvm/include/llvm/Target/GlobalISel/Combine.td b/llvm/include/llvm/Target/GlobalISel/Combine.td
index 3c8d968c2764..d452a08e3c42 100644
--- a/llvm/include/llvm/Target/GlobalISel/Combine.td
+++ b/llvm/include/llvm/Target/GlobalISel/Combine.td
@@ -822,6 +822,15 @@ def trunc_ext_fold: GICombineRule <
   (apply [{ Helper.applyCombineTruncOfExt(*${root}, ${matchinfo}); }])
 >;
 
+// Under certain conditions, transform:
+//  (shl (and x, imm1), imm2) -> (shl x, imm2), when (~imm1 << imm2) = 0
+def shl_and_fold: GICombineRule <
+  (defs root:$root, register_matchinfo:$matchinfo),
+  (match (wip_match_opcode G_SHL):$root,
+         [{ return Helper.matchCombineShlOfAnd(*${root}, ${matchinfo}); }]),
+  (apply [{ Helper.applyCombineShlOfAnd(*${root}, ${matchinfo}); }])
+>;
+
 // Under certain conditions, transform:
 //  trunc (shl x, K)     -> shl (trunc x), K//
 //  trunc ([al]shr x, K) -> (trunc ([al]shr (trunc x), K))
@@ -1588,7 +1597,7 @@ def identity_combines : GICombineGroup<[select_same_val, right_identity_zero,
                                         bitcast_bitcast_fold, fptrunc_fpext_fold,
                                         right_identity_neg_zero_fp,
                                         right_identity_neg_one_fp,
-                                        combine_inttoptr_constant]>;
+                                        combine_inttoptr_constant, shl_and_fold]>;
 
 def const_combines : GICombineGroup<[constant_fold_fp_ops, const_ptradd_to_i2p,
                                      overlapping_and, mulo_by_2, mulo_by_0,
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index ec7ca5dc8e2b..64baad5439ac 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -2553,6 +2553,39 @@ void CombinerHelper::applyCombineTruncOfExt(
   MI.eraseFromParent();
 }
 
+bool CombinerHelper::matchCombineShlOfAnd(MachineInstr &MI, Register &Reg) {
+  // We're trying to match the following pattern:
+  //   %t = G_AND %x, imm1
+  //   %root = G_SHL %t, imm2
+  // -->
+  //   %root = G_SHL %x, imm2
+  // Where (~imm1 << imm2) = 0
+  assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected a G_SHL");
+  const Register DstReg = MI.getOperand(0).getReg();
+  const Register SrcReg = MI.getOperand(1).getReg();
+  const LLT SrcTy = MRI.getType(SrcReg);
+  const unsigned Size = SrcTy.getSizeInBits();
+
+  // Try to match shl (and x, imm1), imm2
+  int64_t ShiftImm, AndImm;
+  if (!mi_match(DstReg, MRI,
+                m_GShl(m_OneNonDBGUse(m_GAnd(m_Reg(Reg), m_ICst(AndImm))),
+                       m_ICst(ShiftImm))))
+    return false;
+  // The AND is redundant iff every bit it clears is shifted out of the type,
+  // i.e. (~AndImm << ShiftImm) has no bits set within Size. If the AND clears
+  // any bit that survives the shift, it cannot be removed.
+  uint64_t Mask = ~0ULL >> (64 - Size);
+  return !((~static_cast<uint64_t>(AndImm) << ShiftImm) & Mask);
+}
+
+void CombinerHelper::applyCombineShlOfAnd(MachineInstr &MI, Register &Reg) {
+  assert(MI.getOpcode() == TargetOpcode::G_SHL && "Expected a G_SHL");
+  Observer.changingInstr(MI);
+  MI.getOperand(1).setReg(Reg);
+  Observer.changedInstr(MI);
+}
+
 static LLT getMidVTForTruncRightShiftCombine(LLT ShiftTy, LLT TruncTy) {
   const unsigned ShiftSize = ShiftTy.getScalarSizeInBits();
   const unsigned TruncSize = TruncTy.getScalarSizeInBits();
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl-and.mir b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl-and.mir
new file mode 100644
index 000000000000..3c83b5c165fc
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shl-and.mir
@@ -0,0 +1,152 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# Modifications (c) Copyright 2024 Advanced Micro Devices, Inc. or its affiliates
+# RUN: llc -mtriple aarch64 -run-pass=aarch64-postlegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s
+
+---
+name:            test_combine_shl_of_and_I16_shift_8
+legalized:       true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_shl_of_and_I16_shift_8
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 8
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[TRUNC]], [[C]](s16)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[SHL]](s16)
+    ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s16) = G_CONSTANT i16 8
+    %2:_(s16) = G_CONSTANT i16 255
+    %3:_(s16) = G_TRUNC %0
+    %4:_(s16) = G_AND %3, %2
+    %5:_(s16) = G_SHL %4, %1
+    %6:_(s32) = G_SEXT %5
+    $w0 = COPY %6(s32)
+...
+---
+# Negative test case: the AND mask (15) clears bits that survive the shift by 4, i.e. (~imm1 << imm2) != 0.
+name:            test_combine_shl_of_and_I16_shift_4_neg
+legalized:       true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_shl_of_and_I16_shift_4_neg
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 4
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s16) = G_CONSTANT i16 15
+    ; CHECK-NEXT: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[COPY]](s32)
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s16) = G_AND [[TRUNC]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL [[AND]], [[C]](s16)
+    ; CHECK-NEXT: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[SHL]](s16)
+    ; CHECK-NEXT: $w0 = COPY [[SEXT]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s16) = G_CONSTANT i16 4
+    %2:_(s16) = G_CONSTANT i16 15
+    %3:_(s16) = G_TRUNC %0
+    %4:_(s16) = G_AND %3, %2
+    %5:_(s16) = G_SHL %4, %1
+    %6:_(s32) = G_SEXT %5
+    $w0 = COPY %6(s32)
+...
+---
+name:            test_combine_shl_of_and_I32_shift_16
+legalized:       true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_16
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[SHL]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s32) = G_CONSTANT i32 65535
+    %3:_(s32) = G_AND %0, %2
+    %4:_(s32) = G_SHL %3, %1
+    $w0 = COPY %4(s32)
+...
+---
+name:            test_combine_shl_of_and_I32_shift_24
+legalized:       true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_24
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 24
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[COPY]], [[C]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[SHL]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 24
+    %2:_(s32) = G_CONSTANT i32 16777215
+    %3:_(s32) = G_AND %0, %2
+    %4:_(s32) = G_SHL %3, %1
+    $w0 = COPY %4(s32)
+...
+---
+# Negative test case: the AND mask (255) clears bits that survive the shift by 8, i.e. (~imm1 << imm2) != 0.
+name:            test_combine_shl_of_and_I32_shift_8_neg
+legalized:       true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_8_neg
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 8
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[SHL]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 8
+    %2:_(s32) = G_CONSTANT i32 255
+    %3:_(s32) = G_AND %0, %2
+    %4:_(s32) = G_SHL %3, %1
+    $w0 = COPY %4(s32)
+...
+---
+# Negative test case: imm1 (255) and imm2 (16) do not satisfy (~imm1 << imm2) = 0, so the AND cannot be removed.
+name:            test_combine_shl_of_and_I32_shift_16_neg
+legalized:       true
+tracksRegLiveness: true
+body: |
+  bb.0.entry:
+    liveins: $w0
+    ; CHECK-LABEL: name: test_combine_shl_of_and_I32_shift_16_neg
+    ; CHECK: liveins: $w0
+    ; CHECK-NEXT: {{  $}}
+    ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $w0
+    ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 16
+    ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 255
+    ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[COPY]], [[C1]]
+    ; CHECK-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL [[AND]], [[C]](s32)
+    ; CHECK-NEXT: $w0 = COPY [[SHL]](s32)
+    %0:_(s32) = COPY $w0
+    %1:_(s32) = G_CONSTANT i32 16
+    %2:_(s32) = G_CONSTANT i32 255
+    %3:_(s32) = G_AND %0, %2
+    %4:_(s32) = G_SHL %3, %1
+    $w0 = COPY %4(s32)
+...
diff --git a/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll b/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll
index 5a70d6286429..5e2a077b515d 100644
--- a/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll
+++ b/llvm/test/CodeGen/AIE/aie2/bfloat16_to_float.ll
@@ -14,10 +14,10 @@ define dso_local noundef float @bfloat16_to_float_test(%class.bfloat16 %bf.coerc
 ; CHECK:         .p2align 4
 ; CHECK-NEXT:  // %bb.0: // %entry
 ; CHECK-NEXT:    nopb ; nopa ; nops ; ret lr ; nopm ; nopv
-; CHECK-NEXT:    nop // Delay Slot 5
+; CHECK-NEXT:    nopx // Delay Slot 5
 ; CHECK-NEXT:    nop // Delay Slot 4
 ; CHECK-NEXT:    nop // Delay Slot 3
-; CHECK-NEXT:    mova r0, #16; extend.u16 r1, r1 // Delay Slot 2
+; CHECK-NEXT:    mova r0, #16 // Delay Slot 2
 ; CHECK-NEXT:    lshl r0, r1, r0 // Delay Slot 1
 entry:
   %bf.coerce.fca.0.extract = extractvalue %class.bfloat16 %bf.coerce, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
index 496ee9f2dbb2..937911dcab15 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/add.v2i16.ll
@@ -262,11 +262,10 @@ define <2 x i16> @v_add_v2i16_neg_inline_imm_hi(<2 x i16> %a) {
 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_splat:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_sub_i32 s1, s1, 64
 ; GFX7-NEXT:    s_sub_i32 s0, s0, 64
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT:    s_add_i32 s1, s1, 0xffc00000
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -304,11 +303,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_splat(<2 x i16> inreg %a) {
 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_lo:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_add_i32 s1, s1, 4
 ; GFX7-NEXT:    s_sub_i32 s0, s0, 64
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT:    s_add_i32 s1, s1, 0x40000
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -346,11 +344,10 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_lo(<2 x i16> inreg %a) {
 define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 ; GFX7-LABEL: s_add_v2i16_neg_inline_imm_hi:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_sub_i32 s1, s1, 64
 ; GFX7-NEXT:    s_add_i32 s0, s0, 4
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff
-; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
+; GFX7-NEXT:    s_add_i32 s1, s1, 0xffc00000
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
@@ -388,9 +385,8 @@ define amdgpu_ps i32 @s_add_v2i16_neg_inline_imm_hi(<2 x i16> inreg %a) {
 define amdgpu_ps i32 @s_add_v2i16(<2 x i16> inreg %a, <2 x i16> inreg %b) {
 ; GFX7-LABEL: s_add_v2i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    s_add_i32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
@@ -439,9 +435,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs(<2 x half> inreg %a, <2 x i16> inreg
 ; GFX7-NEXT:    s_or_b32 s0, s1, s0
 ; GFX7-NEXT:    s_xor_b32 s0, s0, 0x80008000
 ; GFX7-NEXT:    s_lshr_b32 s1, s0, 16
-; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    s_add_i32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
@@ -495,9 +490,8 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_rhs(<2 x i16> inreg %a, <2 x half> inreg
 ; GFX7-NEXT:    s_or_b32 s2, s3, s2
 ; GFX7-NEXT:    s_xor_b32 s2, s2, 0x80008000
 ; GFX7-NEXT:    s_lshr_b32 s3, s2, 16
-; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    s_add_i32 s0, s0, s2
-; GFX7-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX7-NEXT:    s_add_i32 s1, s1, s3
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
@@ -556,11 +550,10 @@ define amdgpu_ps i32 @s_add_v2i16_fneg_lhs_fneg_rhs(<2 x half> inreg %a, <2 x ha
 ; GFX7-NEXT:    s_xor_b32 s1, s1, 0x80008000
 ; GFX7-NEXT:    s_lshr_b32 s2, s0, 16
 ; GFX7-NEXT:    s_lshr_b32 s3, s1, 16
-; GFX7-NEXT:    s_add_i32 s2, s2, s3
 ; GFX7-NEXT:    s_add_i32 s0, s0, s1
-; GFX7-NEXT:    s_and_b32 s1, s2, 0xffff
+; GFX7-NEXT:    s_add_i32 s2, s2, s3
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
-; GFX7-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX7-NEXT:    s_lshl_b32 s1, s2, 16
 ; GFX7-NEXT:    s_or_b32 s0, s0, s1
 ; GFX7-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
index 4be00fedb972..887d42e44125 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/andn2.ll
@@ -688,11 +688,9 @@ define <2 x i16> @v_andn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -741,11 +739,9 @@ define amdgpu_ps i48 @s_andn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1
 define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v3i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -794,7 +790,6 @@ define amdgpu_ps i48 @s_andn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inr
 define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_andn2_v3i16_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
@@ -802,9 +797,8 @@ define amdgpu_ps { i48, i48 } @s_andn2_v3i16_multi_use(<3 x i16> inreg %src0, <3
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GFX6-NEXT:    s_and_b32 s1, s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s0, s2, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX6-NEXT:    s_and_b64 s[0:1], s[0:1], s[6:7]
@@ -866,10 +860,8 @@ define <3 x i16> @v_andn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
 ; GFX6-LABEL: v_andn2_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
index ec832ed0f7f3..70d4265c489e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ashr.ll
@@ -815,11 +815,10 @@ define <2 x i16> @v_ashr_v2i16_15(<2 x i16> %value) {
 define amdgpu_ps i32 @s_ashr_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: s_ashr_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
-; GFX6-NEXT:    s_ashr_i32 s1, s1, s3
+; GFX6-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s1, s1, s3
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -873,7 +872,6 @@ define amdgpu_ps float @ashr_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount)
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s1
 ; GFX6-NEXT:    v_ashr_i32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -911,7 +909,6 @@ define amdgpu_ps float @ashr_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 ; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 16
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -966,14 +963,12 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1009,21 +1004,19 @@ define <2 x float> @v_ashr_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 define amdgpu_ps <2 x i32> @s_ashr_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
 ; GFX6-LABEL: s_ashr_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
-; GFX6-NEXT:    s_ashr_i32 s1, s1, s5
+; GFX6-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, s4
+; GFX6-NEXT:    s_ashr_i32 s1, s1, s5
 ; GFX6-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX6-NEXT:    s_sext_i32_i16 s3, s3
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, s6
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, s7
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1133,7 +1126,6 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v13
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 16
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, v8, v5
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 16
@@ -1144,17 +1136,14 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 16
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v7, v8, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1200,37 +1189,33 @@ define <4 x float> @v_ashr_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 define amdgpu_ps <4 x i32> @s_ashr_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
 ; GFX6-LABEL: s_ashr_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX6-NEXT:    s_sext_i32_i16 s0, s0
-; GFX6-NEXT:    s_ashr_i32 s1, s1, s9
+; GFX6-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, s8
+; GFX6-NEXT:    s_ashr_i32 s1, s1, s9
 ; GFX6-NEXT:    s_sext_i32_i16 s2, s2
 ; GFX6-NEXT:    s_sext_i32_i16 s3, s3
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, s10
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, s11
+; GFX6-NEXT:    s_sext_i32_i16 s4, s4
 ; GFX6-NEXT:    s_sext_i32_i16 s5, s5
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
-; GFX6-NEXT:    s_sext_i32_i16 s4, s4
+; GFX6-NEXT:    s_ashr_i32 s4, s4, s12
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, s13
+; GFX6-NEXT:    s_sext_i32_i16 s6, s6
 ; GFX6-NEXT:    s_sext_i32_i16 s7, s7
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_ashr_i32 s4, s4, s12
-; GFX6-NEXT:    s_sext_i32_i16 s6, s6
-; GFX6-NEXT:    s_ashr_i32 s7, s7, s15
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, s14
+; GFX6-NEXT:    s_ashr_i32 s7, s7, s15
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
index 6a291510fe66..c5a5b3e4750d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.postlegal.mir
@@ -113,10 +113,8 @@ body:             |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX6-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     ;
@@ -124,10 +122,8 @@ body:             |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     %narrow:_(s32) = COPY $vgpr0
@@ -151,10 +147,8 @@ body:             |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX6-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     ;
@@ -162,10 +156,8 @@ body:             |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     %narrow:_(s32) = COPY $vgpr0
@@ -189,10 +181,8 @@ body:             |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX6-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     ;
@@ -200,10 +190,8 @@ body:             |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     %narrow:_(s32) = COPY $vgpr0
@@ -227,10 +215,8 @@ body:             |
     ; GFX6: liveins: $vgpr0
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX6-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX6-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX6-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     ;
@@ -238,10 +224,8 @@ body:             |
     ; GFX9: liveins: $vgpr0
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %narrow:_(s32) = COPY $vgpr0
-    ; GFX9-NEXT: %masklow30:_(s32) = G_CONSTANT i32 1073741823
-    ; GFX9-NEXT: %masked:_(s32) = G_AND %narrow, %masklow30
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %masked, [[C]](s32)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s32) = G_SHL %narrow, [[C]](s32)
     ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s32)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     %narrow:_(s32) = COPY $vgpr0
@@ -280,10 +264,8 @@ body:             |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32)
-    ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383
-    ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16)
     ; GFX9-NEXT: %shl:_(s32) = G_ZEXT [[SHL]](s16)
     ; GFX9-NEXT: $vgpr0 = COPY %shl(s32)
     %argument:_(s32) = COPY $vgpr0
@@ -321,10 +303,8 @@ body:             |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32)
-    ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383
-    ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16)
     ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s16)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     %argument:_(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
index 6ceb41199af6..fff976e1042b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/combine-shl-from-extend-narrow.prelegal.mir
@@ -14,10 +14,8 @@ body:             |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX6-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32)
-    ; GFX6-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383
-    ; GFX6-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14
     ; GFX6-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
-    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16)
+    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16)
     ; GFX6-NEXT: %shl:_(s32) = G_ZEXT [[SHL]](s16)
     ; GFX6-NEXT: $vgpr0 = COPY %shl(s32)
     ;
@@ -26,10 +24,8 @@ body:             |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32)
-    ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383
-    ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16)
     ; GFX9-NEXT: %shl:_(s32) = G_ZEXT [[SHL]](s16)
     ; GFX9-NEXT: $vgpr0 = COPY %shl(s32)
     %argument:_(s32) = COPY $vgpr0
@@ -54,10 +50,8 @@ body:             |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX6-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32)
-    ; GFX6-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383
-    ; GFX6-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14
     ; GFX6-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
-    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16)
+    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16)
     ; GFX6-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s16)
     ; GFX6-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     ;
@@ -66,10 +60,8 @@ body:             |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX9-NEXT: %narrow:_(s16) = G_TRUNC %argument(s32)
-    ; GFX9-NEXT: %masklow14:_(s16) = G_CONSTANT i16 16383
-    ; GFX9-NEXT: %masked:_(s16) = G_AND %narrow, %masklow14
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s16) = G_CONSTANT i16 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %masked, [[C]](s16)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s16) = G_SHL %narrow, [[C]](s16)
     ; GFX9-NEXT: %shl:_(s64) = G_ZEXT [[SHL]](s16)
     ; GFX9-NEXT: $vgpr0_vgpr1 = COPY %shl(s64)
     %argument:_(s32) = COPY $vgpr0
@@ -94,10 +86,8 @@ body:             |
     ; GFX6-NEXT: {{  $}}
     ; GFX6-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX6-NEXT: %narrow:_(s8) = G_TRUNC %argument(s32)
-    ; GFX6-NEXT: %masklow6:_(s8) = G_CONSTANT i8 63
-    ; GFX6-NEXT: %masked:_(s8) = G_AND %narrow, %masklow6
     ; GFX6-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2
-    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8)
+    ; GFX6-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %narrow, [[C]](s8)
     ; GFX6-NEXT: %result:_(s32) = G_ZEXT [[SHL]](s8)
     ; GFX6-NEXT: $vgpr0 = COPY %result(s32)
     ;
@@ -106,10 +96,8 @@ body:             |
     ; GFX9-NEXT: {{  $}}
     ; GFX9-NEXT: %argument:_(s32) = COPY $vgpr0
     ; GFX9-NEXT: %narrow:_(s8) = G_TRUNC %argument(s32)
-    ; GFX9-NEXT: %masklow6:_(s8) = G_CONSTANT i8 63
-    ; GFX9-NEXT: %masked:_(s8) = G_AND %narrow, %masklow6
     ; GFX9-NEXT: [[C:%[0-9]+]]:_(s8) = G_CONSTANT i8 2
-    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %masked, [[C]](s8)
+    ; GFX9-NEXT: [[SHL:%[0-9]+]]:_(s8) = G_SHL %narrow, [[C]](s8)
     ; GFX9-NEXT: %result:_(s32) = G_ZEXT [[SHL]](s8)
     ; GFX9-NEXT: $vgpr0 = COPY %result(s32)
     %argument:_(s32) = COPY $vgpr0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
index 02781e763f44..0416d7046418 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll
@@ -751,18 +751,17 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v1, v5
 ; SI-NEXT:    v_cvt_f32_ubyte0_e32 v2, v7
 ; SI-NEXT:    v_and_b32_e32 v5, 0xff, v8
-; SI-NEXT:    v_add_i32_e32 v4, vcc, 9, v4
 ; SI-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; SI-NEXT:    v_and_b32_e32 v7, 0xff, v9
 ; SI-NEXT:    buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT:    s_waitcnt expcnt(0)
 ; SI-NEXT:    v_lshlrev_b32_e32 v0, 8, v5
-; SI-NEXT:    v_and_b32_e32 v4, 0xff, v4
+; SI-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
 ; SI-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
 ; SI-NEXT:    v_or_b32_e32 v0, v6, v0
-; SI-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
+; SI-NEXT:    v_add_i32_e32 v4, vcc, 0x9000000, v4
 ; SI-NEXT:    v_or_b32_e32 v0, v0, v1
-; SI-NEXT:    v_or_b32_e32 v0, v0, v2
+; SI-NEXT:    v_or_b32_e32 v0, v0, v4
 ; SI-NEXT:    s_mov_b64 s[4:5], s[2:3]
 ; SI-NEXT:    buffer_store_dword v0, off, s[4:7], 0
 ; SI-NEXT:    s_endpgm
@@ -789,20 +788,19 @@ define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(ptr addrspace(1) noalias %o
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte3_e32 v3, v1
 ; VI-NEXT:    v_add_u16_e32 v9, 9, v1
-; VI-NEXT:    v_add_u16_sdwa v10, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; VI-NEXT:    v_add_u16_sdwa v10, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_cvt_f32_ubyte0_sdwa v2, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; VI-NEXT:    v_add_u16_e32 v7, 9, v7
 ; VI-NEXT:    v_add_u16_e32 v8, 9, v8
 ; VI-NEXT:    flat_store_dwordx4 v[4:5], v[0:3]
-; VI-NEXT:    v_and_b32_e32 v10, 0xff, v10
+; VI-NEXT:    s_nop 0
 ; VI-NEXT:    v_lshlrev_b32_sdwa v0, v6, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; VI-NEXT:    v_and_b32_e32 v1, 0xff, v8
 ; VI-NEXT:    v_or_b32_sdwa v0, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; VI-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; VI-NEXT:    v_lshlrev_b32_e32 v2, 24, v10
 ; VI-NEXT:    v_or_b32_e32 v0, v0, v1
-; VI-NEXT:    v_or_b32_e32 v2, v0, v2
+; VI-NEXT:    v_or_b32_e32 v2, v0, v10
 ; VI-NEXT:    v_mov_b32_e32 v0, s2
 ; VI-NEXT:    v_mov_b32_e32 v1, s3
 ; VI-NEXT:    flat_store_dword v[0:1], v2
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
index f9b98059be0b..53db0574ae48 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshl.ll
@@ -709,7 +709,6 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX8-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX8-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX8-NEXT:    s_or_b32 s1, s1, s2
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -736,7 +735,6 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX9-NEXT:    s_lshr_b32 s3, s3, 1
 ; GFX9-NEXT:    s_lshr_b32 s2, s3, s2
 ; GFX9-NEXT:    s_or_b32 s1, s1, s2
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
@@ -745,25 +743,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-LABEL: s_fshl_v2i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
-; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    s_and_b32 s6, s2, 7
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX10-NEXT:    s_and_b32 s6, s5, 7
-; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
-; GFX10-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX10-NEXT:    s_and_b32 s6, s2, 7
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
-; GFX10-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
-; GFX10-NEXT:    s_or_b32 s2, s3, s4
+; GFX10-NEXT:    s_and_b32 s2, s4, 0xff
+; GFX10-NEXT:    s_and_b32 s4, s5, 7
+; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX10-NEXT:    s_lshr_b32 s2, s2, s5
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX10-NEXT:    s_or_b32 s1, s3, s2
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -772,25 +769,24 @@ define amdgpu_ps i16 @s_fshl_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX11-LABEL: s_fshl_v2i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_lshr_b32 s4, s1, 8
-; GFX11-NEXT:    s_lshr_b32 s5, s2, 8
-; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    s_and_b32 s6, s2, 7
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX11-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_lshl_b32 s0, s0, s6
-; GFX11-NEXT:    s_and_b32 s6, s5, 7
-; GFX11-NEXT:    s_and_not1_b32 s5, 7, s5
-; GFX11-NEXT:    s_lshr_b32 s4, s4, 1
+; GFX11-NEXT:    s_and_b32 s6, s2, 7
 ; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
-; GFX11-NEXT:    s_lshl_b32 s3, s3, s6
-; GFX11-NEXT:    s_lshr_b32 s4, s4, s5
+; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
-; GFX11-NEXT:    s_or_b32 s2, s3, s4
+; GFX11-NEXT:    s_and_b32 s2, s4, 0xff
+; GFX11-NEXT:    s_and_b32 s4, s5, 7
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_and_not1_b32 s5, 7, s5
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
+; GFX11-NEXT:    s_lshl_b32 s0, s0, s6
+; GFX11-NEXT:    s_lshl_b32 s3, s3, s4
+; GFX11-NEXT:    s_lshr_b32 s2, s2, s5
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -851,9 +847,7 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v2, v2, v3
-; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -877,65 +871,59 @@ define i16 @v_fshl_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v1, v1, v3
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v3, v6, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v2, v2, v3
-; GFX9-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v2i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX10-NEXT:    v_not_b32_e32 v7, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX10-NEXT:    v_not_b32_e32 v6, v2
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT:    v_not_b32_e32 v6, v3
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX10-NEXT:    v_not_b32_e32 v7, v4
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX10-NEXT:    v_lshrrev_b16 v4, 1, v4
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
-; GFX10-NEXT:    v_lshlrev_b16 v3, v3, v5
+; GFX10-NEXT:    v_and_b32_e32 v4, 7, v4
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX10-NEXT:    v_lshrrev_b16 v5, 1, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v7, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v3, v4
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX10-NEXT:    v_lshrrev_b16 v1, v6, v1
+; GFX10-NEXT:    v_lshlrev_b16 v2, v4, v3
+; GFX10-NEXT:    v_lshrrev_b16 v3, v7, v5
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshl_v2i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 8, v1
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v0
-; GFX11-NEXT:    v_not_b32_e32 v7, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
+; GFX11-NEXT:    v_not_b32_e32 v6, v2
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_not_b32_e32 v6, v3
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v4
-; GFX11-NEXT:    v_and_b32_e32 v3, 7, v3
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX11-NEXT:    v_not_b32_e32 v7, v4
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX11-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX11-NEXT:    v_and_b32_e32 v6, 7, v6
-; GFX11-NEXT:    v_lshrrev_b16 v4, 1, v4
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 1, v1
-; GFX11-NEXT:    v_lshlrev_b16 v3, v3, v5
+; GFX11-NEXT:    v_and_b32_e32 v4, 7, v4
+; GFX11-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX11-NEXT:    v_lshrrev_b16 v5, 1, v5
 ; GFX11-NEXT:    v_lshlrev_b16 v0, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshrrev_b16 v4, v6, v4
-; GFX11-NEXT:    v_lshrrev_b16 v1, v7, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_or_b32_e32 v2, v3, v4
+; GFX11-NEXT:    v_lshrrev_b16 v1, v6, v1
+; GFX11-NEXT:    v_lshlrev_b16 v2, v4, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshrrev_b16 v3, v7, v5
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
@@ -977,20 +965,19 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_andn2_b32 s6, 7, s7
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, 1
 ; GFX6-NEXT:    s_lshr_b32 s4, s4, s6
+; GFX6-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    s_and_b32 s4, s8, 7
 ; GFX6-NEXT:    s_andn2_b32 s6, 7, s8
 ; GFX6-NEXT:    s_lshr_b32 s1, s1, 25
-; GFX6-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX6-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 8
-; GFX6-NEXT:    s_or_b32 s1, s4, s1
+; GFX6-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX6-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX6-NEXT:    s_or_b32 s1, s4, s1
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -1044,8 +1031,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_or_b32 s3, s3, s4
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1097,8 +1083,7 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_or_b32 s3, s3, s4
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
@@ -1108,48 +1093,47 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX10-NEXT:    s_and_b32 s11, s2, 7
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
-; GFX10-NEXT:    s_and_b32 s12, s2, 7
-; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_andn2_b32 s12, 7, s2
 ; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s2
-; GFX10-NEXT:    s_and_b32 s2, s6, 0xff
-; GFX10-NEXT:    s_and_b32 s6, s9, 7
-; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s12
-; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
-; GFX10-NEXT:    s_lshr_b32 s2, s2, s9
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s11
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_or_b32 s1, s3, s2
-; GFX10-NEXT:    s_and_b32 s2, s7, 0xff
-; GFX10-NEXT:    s_and_b32 s3, s10, 7
-; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_andn2_b32 s6, 7, s10
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX10-NEXT:    s_lshr_b32 s2, s2, s6
-; GFX10-NEXT:    s_and_b32 s4, s11, 7
-; GFX10-NEXT:    s_andn2_b32 s6, 7, s11
-; GFX10-NEXT:    s_lshr_b32 s7, s8, 1
-; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX10-NEXT:    s_lshr_b32 s5, s7, s6
-; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_and_b32 s1, s6, 0xff
+; GFX10-NEXT:    s_and_b32 s6, s9, 7
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
+; GFX10-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s9
+; GFX10-NEXT:    s_and_b32 s7, s10, 7
+; GFX10-NEXT:    s_andn2_b32 s9, 7, s10
+; GFX10-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-NEXT:    s_lshl_b32 s4, s4, s7
+; GFX10-NEXT:    s_lshr_b32 s6, s6, s9
+; GFX10-NEXT:    s_or_b32 s1, s3, s1
+; GFX10-NEXT:    s_or_b32 s3, s4, s6
+; GFX10-NEXT:    s_and_b32 s4, s2, 7
+; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_lshr_b32 s6, s8, 1
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_or_b32 s3, s4, s5
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX10-NEXT:    s_lshl_b32 s1, s3, 16
+; GFX10-NEXT:    s_or_b32 s2, s4, s2
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -1161,48 +1145,47 @@ define amdgpu_ps i32 @s_fshl_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX11-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s8, s1, 24
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX11-NEXT:    s_and_b32 s11, s2, 7
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX11-NEXT:    s_lshr_b32 s11, s2, 24
-; GFX11-NEXT:    s_and_b32 s12, s2, 7
-; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT:    s_and_not1_b32 s12, 7, s2
 ; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
 ; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX11-NEXT:    s_lshr_b32 s1, s1, s2
-; GFX11-NEXT:    s_and_b32 s2, s6, 0xff
-; GFX11-NEXT:    s_and_b32 s6, s9, 7
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_and_not1_b32 s9, 7, s9
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
 ; GFX11-NEXT:    s_lshr_b32 s5, s0, 24
-; GFX11-NEXT:    s_lshl_b32 s0, s0, s12
-; GFX11-NEXT:    s_lshl_b32 s3, s3, s6
-; GFX11-NEXT:    s_lshr_b32 s2, s2, s9
+; GFX11-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX11-NEXT:    s_lshl_b32 s0, s0, s11
+; GFX11-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX11-NEXT:    s_lshr_b32 s10, s2, 16
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-NEXT:    s_and_b32 s2, s7, 0xff
-; GFX11-NEXT:    s_and_b32 s3, s10, 7
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_and_not1_b32 s6, 7, s10
-; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX11-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX11-NEXT:    s_lshr_b32 s2, s2, s6
-; GFX11-NEXT:    s_and_b32 s4, s11, 7
-; GFX11-NEXT:    s_and_not1_b32 s6, 7, s11
-; GFX11-NEXT:    s_lshr_b32 s7, s8, 1
-; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX11-NEXT:    s_lshr_b32 s5, s7, s6
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_and_b32 s1, s6, 0xff
+; GFX11-NEXT:    s_and_b32 s6, s9, 7
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_lshl_b32 s3, s3, s6
+; GFX11-NEXT:    s_and_b32 s6, s7, 0xff
+; GFX11-NEXT:    s_and_not1_b32 s9, 7, s9
+; GFX11-NEXT:    s_lshr_b32 s1, s1, 1
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX11-NEXT:    s_lshr_b32 s1, s1, s9
+; GFX11-NEXT:    s_and_b32 s7, s10, 7
+; GFX11-NEXT:    s_and_not1_b32 s9, 7, s10
+; GFX11-NEXT:    s_lshr_b32 s6, s6, 1
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX11-NEXT:    s_lshl_b32 s4, s4, s7
+; GFX11-NEXT:    s_lshr_b32 s6, s6, s9
+; GFX11-NEXT:    s_or_b32 s1, s3, s1
+; GFX11-NEXT:    s_or_b32 s3, s4, s6
+; GFX11-NEXT:    s_and_b32 s4, s2, 7
+; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT:    s_lshr_b32 s6, s8, 1
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_or_b32 s3, s4, s5
+; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX11-NEXT:    s_lshr_b32 s2, s6, s2
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX11-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s3, 16
+; GFX11-NEXT:    s_or_b32 s2, s4, s2
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1249,20 +1232,19 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, 1, v4
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v4, v6, v4
 ; GFX6-NEXT:    v_not_b32_e32 v6, v8
+; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v2
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v4, 7, v8
 ; GFX6-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX6-NEXT:    v_lshrrev_b32_e32 v1, 25, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
-; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v4, v5
+; GFX6-NEXT:    v_lshrrev_b32_e32 v1, v6, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX6-NEXT:    v_or_b32_e32 v1, v4, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -1306,15 +1288,13 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1356,67 +1336,63 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshrrev_b16_e32 v1, v6, v1
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v9, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshl_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v2
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX10-NEXT:    v_and_b32_e32 v9, 7, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
-; GFX10-NEXT:    v_not_b32_e32 v9, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_lshlrev_b16 v0, v10, v0
-; GFX10-NEXT:    v_not_b32_e32 v10, v8
-; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_lshlrev_b16 v0, v9, v0
+; GFX10-NEXT:    v_not_b32_e32 v9, v10
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 24, v1
 ; GFX10-NEXT:    v_and_b32_e32 v12, 0xff, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
-; GFX10-NEXT:    v_lshlrev_b16 v3, v8, v3
-; GFX10-NEXT:    v_not_b32_e32 v8, v11
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_not_b32_e32 v13, v2
 ; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
 ; GFX10-NEXT:    v_lshrrev_b16 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
+; GFX10-NEXT:    v_not_b32_e32 v14, v11
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_not_b32_e32 v8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX10-NEXT:    v_lshlrev_b16 v3, v10, v3
+; GFX10-NEXT:    v_lshrrev_b16 v6, v9, v6
+; GFX10-NEXT:    v_and_b32_e32 v9, 7, v11
+; GFX10-NEXT:    v_and_b32_e32 v10, 7, v14
 ; GFX10-NEXT:    v_lshrrev_b16 v1, 1, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_and_b32_e32 v13, 7, v13
-; GFX10-NEXT:    v_lshrrev_b16 v7, 1, v7
-; GFX10-NEXT:    v_and_b32_e32 v9, 7, v9
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
 ; GFX10-NEXT:    v_lshrrev_b16 v12, 1, v12
-; GFX10-NEXT:    v_lshrrev_b16 v6, v10, v6
-; GFX10-NEXT:    v_lshlrev_b16 v4, v11, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v8, v1
-; GFX10-NEXT:    v_lshlrev_b16 v2, v2, v5
-; GFX10-NEXT:    v_lshrrev_b16 v5, v13, v7
-; GFX10-NEXT:    v_lshrrev_b16 v7, v9, v12
+; GFX10-NEXT:    v_not_b32_e32 v11, v2
+; GFX10-NEXT:    v_lshlrev_b16 v4, v9, v4
+; GFX10-NEXT:    v_lshrrev_b16 v1, v10, v1
 ; GFX10-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX10-NEXT:    v_mov_b32_e32 v6, 8
+; GFX10-NEXT:    v_lshrrev_b16 v8, v8, v12
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 7, v11
+; GFX10-NEXT:    v_lshrrev_b16 v7, 1, v7
+; GFX10-NEXT:    v_mov_b32_e32 v9, 8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX10-NEXT:    v_or_b32_e32 v2, v2, v5
-; GFX10-NEXT:    v_or_b32_e32 v0, v0, v7
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v6, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_or_b32_e32 v0, v0, v8
+; GFX10-NEXT:    v_lshlrev_b16 v2, v2, v5
+; GFX10-NEXT:    v_lshrrev_b16 v4, v6, v7
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1425,55 +1401,53 @@ define i32 @v_fshl_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 16, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 16, v2
+; GFX11-NEXT:    v_and_b32_e32 v12, 7, v2
 ; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX11-NEXT:    v_not_b32_e32 v13, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 24, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
+; GFX11-NEXT:    v_not_b32_e32 v11, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 24, v2
+; GFX11-NEXT:    v_lshlrev_b16 v0, v12, v0
 ; GFX11-NEXT:    v_and_b32_e32 v9, 7, v9
-; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
+; GFX11-NEXT:    v_and_b32_e32 v12, 7, v13
 ; GFX11-NEXT:    v_lshrrev_b16 v6, 1, v6
-; GFX11-NEXT:    v_and_b32_e32 v13, 7, v13
+; GFX11-NEXT:    v_not_b32_e32 v13, v10
 ; GFX11-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX11-NEXT:    v_lshlrev_b16 v3, v9, v3
-; GFX11-NEXT:    v_not_b32_e32 v9, v10
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
-; GFX11-NEXT:    v_lshrrev_b16 v6, v13, v6
-; GFX11-NEXT:    v_not_b32_e32 v13, v11
-; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
-; GFX11-NEXT:    v_and_b32_e32 v12, 7, v2
-; GFX11-NEXT:    v_not_b32_e32 v2, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 24, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v10, 7, v10
-; GFX11-NEXT:    v_and_b32_e32 v9, 7, v9
+; GFX11-NEXT:    v_lshlrev_b16 v3, v9, v3
+; GFX11-NEXT:    v_lshrrev_b16 v6, v12, v6
+; GFX11-NEXT:    v_and_b32_e32 v9, 7, v10
+; GFX11-NEXT:    v_and_b32_e32 v10, 7, v13
 ; GFX11-NEXT:    v_lshrrev_b16 v7, 1, v7
+; GFX11-NEXT:    v_not_b32_e32 v12, v2
 ; GFX11-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX11-NEXT:    v_and_b32_e32 v13, 7, v13
-; GFX11-NEXT:    v_lshrrev_b16 v8, 1, v8
-; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
 ; GFX11-NEXT:    v_lshrrev_b16 v1, 1, v1
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX11-NEXT:    v_lshlrev_b16 v4, v10, v4
-; GFX11-NEXT:    v_lshrrev_b16 v6, v9, v7
-; GFX11-NEXT:    v_lshlrev_b16 v5, v11, v5
-; GFX11-NEXT:    v_lshrrev_b16 v7, v13, v8
-; GFX11-NEXT:    v_lshlrev_b16 v0, v12, v0
-; GFX11-NEXT:    v_lshrrev_b16 v1, v2, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b16 v4, v9, v4
+; GFX11-NEXT:    v_lshrrev_b16 v6, v10, v7
+; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT:    v_and_b32_e32 v7, 7, v12
+; GFX11-NEXT:    v_lshrrev_b16 v8, 1, v8
+; GFX11-NEXT:    v_lshrrev_b16 v1, v11, v1
+; GFX11-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_lshlrev_b16 v2, v2, v5
+; GFX11-NEXT:    v_lshrrev_b16 v5, v7, v8
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
 ; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
+; GFX11-NEXT:    v_or_b32_e32 v2, v2, v5
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -1911,8 +1885,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_bfe_u32 v2, v0, 8, 8
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
@@ -1925,7 +1898,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-LABEL: s_fshl_v2i24:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
@@ -1933,47 +1905,43 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
 ; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX8-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX8-NEXT:    s_or_b32 s1, s8, s1
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s1, s1, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s2, 8
-; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_lshr_b32 s9, s3, 8
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_or_b32 s2, s2, s6
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    s_or_b32 s3, s8, s3
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s6
 ; GFX8-NEXT:    s_lshr_b32 s6, s4, 8
-; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 24
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
@@ -1988,10 +1956,9 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s9, s5, 8
-; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_and_b32 s6, s9, 0xff
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_or_b32 s5, s8, s5
 ; GFX8-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
@@ -2033,8 +2000,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
@@ -2045,7 +2011,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-LABEL: s_fshl_v2i24:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
@@ -2053,47 +2018,43 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
 ; GFX9-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX9-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX9-NEXT:    s_or_b32 s1, s8, s1
 ; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s1, s1, s6
 ; GFX9-NEXT:    s_lshr_b32 s6, s2, 8
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    s_lshr_b32 s7, s2, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s2, 24
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_or_b32 s2, s2, s6
 ; GFX9-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX9-NEXT:    s_lshr_b32 s9, s3, 8
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX9-NEXT:    s_or_b32 s2, s2, s6
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX9-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    s_or_b32 s3, s8, s3
 ; GFX9-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 16
 ; GFX9-NEXT:    s_or_b32 s3, s3, s6
 ; GFX9-NEXT:    s_lshr_b32 s6, s4, 8
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s7, s4, 16
 ; GFX9-NEXT:    s_lshr_b32 s8, s4, 24
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
@@ -2108,7 +2069,6 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX9-NEXT:    s_lshr_b32 s9, s5, 8
-; GFX9-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX9-NEXT:    s_and_b32 s6, s9, 0xff
 ; GFX9-NEXT:    s_or_b32 s5, s8, s5
@@ -2151,8 +2111,7 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_and_or_b32 v2, v1, v2, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 8, 8
 ; GFX9-NEXT:    v_bfe_u32 v0, v0, 16, 8
@@ -2164,116 +2123,109 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-LABEL: s_fshl_v2i24:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX10-NEXT:    s_lshr_b32 s10, s4, 16
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX10-NEXT:    s_lshr_b32 s7, s4, 8
-; GFX10-NEXT:    s_lshr_b32 s11, s4, 24
-; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX10-NEXT:    s_lshr_b32 s14, s4, 8
+; GFX10-NEXT:    s_lshr_b32 s15, s4, 16
+; GFX10-NEXT:    s_lshr_b32 s16, s4, 24
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX10-NEXT:    s_lshr_b32 s12, s5, 8
-; GFX10-NEXT:    s_or_b32 s4, s4, s7
-; GFX10-NEXT:    s_and_b32 s7, s10, 0xff
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX10-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
-; GFX10-NEXT:    s_or_b32 s4, s4, s7
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX10-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX10-NEXT:    s_or_b32 s4, s4, s14
+; GFX10-NEXT:    s_and_b32 s14, 0xffff, s15
+; GFX10-NEXT:    s_lshr_b32 s17, s5, 8
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_and_b32 s7, s12, 0xff
-; GFX10-NEXT:    s_or_b32 s5, s11, s5
-; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX10-NEXT:    s_or_b32 s5, s16, s5
+; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX10-NEXT:    s_and_b32 s16, s17, 0xff
+; GFX10-NEXT:    s_or_b32 s4, s4, s14
+; GFX10-NEXT:    s_and_b32 s15, 0xffff, s16
 ; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX10-NEXT:    s_or_b32 s5, s5, s7
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    s_lshl_b32 s14, s15, 16
+; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX10-NEXT:    s_or_b32 s5, s5, s14
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX10-NEXT:    v_mul_lo_u32 v2, 0xffffffe8, v0
 ; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_and_b32 s7, s9, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 16
-; GFX10-NEXT:    s_or_b32 s1, s8, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT:    s_lshr_b32 s8, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 24
-; GFX10-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 8
+; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX10-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX10-NEXT:    v_mul_hi_u32 v1, s4, v0
-; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT:    s_or_b32 s2, s2, s8
-; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX10-NEXT:    s_or_b32 s1, s8, s1
+; GFX10-NEXT:    s_and_b32 s8, s9, 0xff
+; GFX10-NEXT:    s_lshl_b32 s9, s10, 8
+; GFX10-NEXT:    s_and_b32 s10, s11, 0xff
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
+; GFX10-NEXT:    s_or_b32 s2, s2, s9
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s10
+; GFX10-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
-; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
-; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    v_mul_hi_u32 v2, s4, v0
+; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX10-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    s_or_b32 s1, s1, s7
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT:    s_lshr_b32 s4, s3, 8
-; GFX10-NEXT:    s_and_b32 s5, s9, 0xff
-; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX10-NEXT:    s_or_b32 s3, s10, s3
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    s_or_b32 s2, s2, s5
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX10-NEXT:    s_and_b32 s11, s13, 0xff
+; GFX10-NEXT:    s_or_b32 s2, s2, s6
+; GFX10-NEXT:    s_or_b32 s3, s12, s3
+; GFX10-NEXT:    v_mul_lo_u32 v2, v2, 24
+; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 8
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s4, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s7
+; GFX10-NEXT:    s_and_b32 s7, 0xffff, s11
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, 24, v0
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 16
+; GFX10-NEXT:    s_or_b32 s0, s0, s4
+; GFX10-NEXT:    s_or_b32 s3, s3, s7
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_or_b32 s3, s3, s4
-; GFX10-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
+; GFX10-NEXT:    s_and_b32 s5, 0xffff, s8
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v4, 24, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
-; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX10-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX10-NEXT:    s_lshr_b32 s2, s3, 1
-; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
+; GFX10-NEXT:    v_sub_nc_u32_e32 v4, 23, v0
+; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v3, v3, s2
-; GFX10-NEXT:    v_mov_b32_e32 v2, 8
-; GFX10-NEXT:    v_lshl_or_b32 v0, s1, v0, v3
+; GFX10-NEXT:    v_and_b32_e32 v4, 0xffffff, v4
+; GFX10-NEXT:    s_lshr_b32 s2, s3, 1
+; GFX10-NEXT:    v_lshl_or_b32 v2, s0, v2, v3
+; GFX10-NEXT:    v_lshrrev_b32_e64 v4, v4, s2
+; GFX10-NEXT:    s_lshl_b32 s2, s5, 16
 ; GFX10-NEXT:    v_mov_b32_e32 v3, 16
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v4
-; GFX10-NEXT:    v_bfe_u32 v4, v0, 8, 8
+; GFX10-NEXT:    s_or_b32 s0, s1, s2
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v0, v4
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v2, v1
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v0
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 8, 8
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX10-NEXT:    v_or3_b32 v1, v1, v3, v2
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v4
+; GFX10-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2281,130 +2233,124 @@ define amdgpu_ps i48 @s_fshl_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-LABEL: s_fshl_v2i24:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
+; GFX11-NEXT:    s_lshr_b32 s14, s4, 8
+; GFX11-NEXT:    s_lshr_b32 s15, s4, 16
+; GFX11-NEXT:    s_lshr_b32 s16, s4, 24
+; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 8
+; GFX11-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX11-NEXT:    s_or_b32 s4, s4, s14
+; GFX11-NEXT:    s_and_b32 s14, 0xffff, s15
+; GFX11-NEXT:    s_lshr_b32 s17, s5, 8
+; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
+; GFX11-NEXT:    s_or_b32 s4, s4, s14
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT:    s_and_b32 s14, s17, 0xff
+; GFX11-NEXT:    s_or_b32 s5, s16, s5
+; GFX11-NEXT:    s_and_b32 s14, 0xffff, s14
+; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX11-NEXT:    s_or_b32 s5, s5, s14
+; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
+; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_lshr_b32 s10, s2, 8
+; GFX11-NEXT:    s_lshr_b32 s11, s2, 16
+; GFX11-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s1, s8, s1
+; GFX11-NEXT:    s_and_b32 s8, s9, 0xff
+; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX11-NEXT:    s_lshl_b32 s9, s10, 8
+; GFX11-NEXT:    s_and_b32 s10, s11, 0xff
+; GFX11-NEXT:    s_or_b32 s2, s2, s9
 ; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
+; GFX11-NEXT:    s_lshr_b32 s13, s3, 8
 ; GFX11-NEXT:    s_or_b32 s0, s0, s6
+; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
+; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s7
+; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
 ; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-NEXT:    s_lshr_b32 s7, s4, 16
+; GFX11-NEXT:    s_or_b32 s3, s12, s3
 ; GFX11-NEXT:    s_or_b32 s0, s0, s6
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX11-NEXT:    s_lshr_b32 s6, s4, 8
-; GFX11-NEXT:    s_lshr_b32 s10, s4, 24
-; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX11-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX11-NEXT:    s_or_b32 s4, s4, s6
-; GFX11-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX11-NEXT:    s_or_b32 s4, s4, s6
-; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX11-NEXT:    s_and_b32 s6, s11, 0xff
-; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX11-NEXT:    s_or_b32 s5, s10, s5
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX11-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX11-NEXT:    s_or_b32 s5, s5, s6
-; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_lshr_b32 s7, s2, 8
-; GFX11-NEXT:    s_or_b32 s1, s8, s1
-; GFX11-NEXT:    s_lshr_b32 s8, s2, 16
-; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
-; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX11-NEXT:    s_and_b32 s6, s9, 0xff
-; GFX11-NEXT:    s_lshr_b32 s9, s2, 24
-; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-NEXT:    s_lshl_b32 s7, s7, 8
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
 ; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX11-NEXT:    s_or_b32 s2, s2, s7
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_and_b32 s7, 0xffff, s8
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
+; GFX11-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT:    s_and_b32 s5, s8, 0xff
-; GFX11-NEXT:    s_lshr_b32 s4, s3, 8
-; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
-; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX11-NEXT:    s_and_b32 s5, 0xffff, s10
+; GFX11-NEXT:    s_and_b32 s4, s13, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 16
-; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    s_or_b32 s2, s2, s5
-; GFX11-NEXT:    s_or_b32 s3, s9, s3
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX11-NEXT:    s_or_b32 s2, s2, s5
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
 ; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
-; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX11-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    s_or_b32 s3, s3, s4
-; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    s_lshr_b32 s3, s3, 1
-; GFX11-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
-; GFX11-NEXT:    s_lshl_b32 s2, s6, 16
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
-; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
-; GFX11-NEXT:    s_or_b32 s0, s1, s2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
-; GFX11-NEXT:    v_lshrrev_b32_e64 v3, v3, s3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX11-NEXT:    v_lshrrev_b32_e64 v2, v2, s2
+; GFX11-NEXT:    s_or_b32 s2, s3, s4
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v0, v3
-; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 8
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
+; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v1, v2
+; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
+; GFX11-NEXT:    s_or_b32 s0, s1, s7
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
+; GFX11-NEXT:    v_lshrrev_b32_e64 v2, v3, s2
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 8, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v0, v2
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 8
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v0
 ; GFX11-NEXT:    v_bfe_u32 v5, v0, 8, 8
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v3
 ; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
+; GFX11-NEXT:    v_or3_b32 v1, v1, v2, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
-; GFX11-NEXT:    v_or3_b32 v1, v1, v3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
+; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX11-NEXT:    ; return to shader part epilog
   %lhs = bitcast i48 %lhs.arg to <2 x i24>
   %rhs = bitcast i48 %rhs.arg to <2 x i24>
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
index c8455665e7b4..cd5375886768 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fshr.ll
@@ -706,7 +706,6 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshr_b32 s1, s3, s1
 ; GFX8-NEXT:    s_or_b32 s1, s2, s1
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -733,7 +732,6 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_lshr_b32 s1, s3, s1
 ; GFX9-NEXT:    s_or_b32 s1, s2, s1
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
@@ -746,21 +744,20 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX10-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX10-NEXT:    s_and_b32 s6, s2, 7
 ; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
+; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX10-NEXT:    s_and_b32 s2, s5, 7
 ; GFX10-NEXT:    s_andn2_b32 s5, 7, s5
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, s5
 ; GFX10-NEXT:    s_lshr_b32 s2, s4, s2
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s6
-; GFX10-NEXT:    s_or_b32 s2, s3, s2
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX10-NEXT:    s_or_b32 s1, s3, s2
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -773,21 +770,20 @@ define amdgpu_ps i16 @s_fshr_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg, i16 in
 ; GFX11-NEXT:    s_lshr_b32 s5, s2, 8
 ; GFX11-NEXT:    s_and_b32 s6, s2, 7
 ; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
+; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, s2
 ; GFX11-NEXT:    s_and_b32 s2, s5, 7
 ; GFX11-NEXT:    s_and_not1_b32 s5, 7, s5
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_lshr_b32 s1, s1, s6
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, s5
 ; GFX11-NEXT:    s_lshr_b32 s2, s4, s2
-; GFX11-NEXT:    s_lshr_b32 s1, s1, s6
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_and_b32 s1, s2, 0xff
+; GFX11-NEXT:    s_or_b32 s1, s3, s2
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -848,9 +844,7 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, v2, v3
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX8-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -873,65 +867,59 @@ define i16 @v_fshr_v2i8(i16 %lhs.arg, i16 %rhs.arg, i16 %amt.arg) {
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 1, v3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, v2, v3
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX9-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
+; GFX9-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v2i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX10-NEXT:    v_and_b32_e32 v6, 7, v2
-; GFX10-NEXT:    v_not_b32_e32 v2, v2
-; GFX10-NEXT:    v_not_b32_e32 v7, v3
-; GFX10-NEXT:    v_and_b32_e32 v3, 7, v3
-; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX10-NEXT:    v_not_b32_e32 v6, v2
+; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX10-NEXT:    v_not_b32_e32 v7, v4
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_lshrrev_b16 v3, v3, v5
-; GFX10-NEXT:    v_lshlrev_b16 v4, v7, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v6, v1
-; GFX10-NEXT:    v_lshlrev_b16 v0, v2, v0
-; GFX10-NEXT:    v_or_b32_e32 v2, v4, v3
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
+; GFX10-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX10-NEXT:    v_and_b32_e32 v4, 7, v4
+; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX10-NEXT:    v_lshrrev_b16 v1, v2, v1
+; GFX10-NEXT:    v_lshlrev_b16 v0, v6, v0
+; GFX10-NEXT:    v_lshlrev_b16 v2, v7, v3
+; GFX10-NEXT:    v_lshrrev_b16 v3, v4, v5
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v2i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 8, v2
+; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v7, 7, v2
-; GFX11-NEXT:    v_not_b32_e32 v2, v2
-; GFX11-NEXT:    v_not_b32_e32 v6, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 7, v3
-; GFX11-NEXT:    v_lshlrev_b16 v4, 1, v4
-; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_not_b32_e32 v6, v2
+; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
+; GFX11-NEXT:    v_not_b32_e32 v7, v4
 ; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX11-NEXT:    v_and_b32_e32 v6, 7, v6
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX11-NEXT:    v_lshrrev_b16 v3, v3, v5
+; GFX11-NEXT:    v_and_b32_e32 v6, 7, v6
+; GFX11-NEXT:    v_and_b32_e32 v4, 7, v4
+; GFX11-NEXT:    v_lshlrev_b16 v3, 1, v3
+; GFX11-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
+; GFX11-NEXT:    v_lshrrev_b16 v1, v2, v1
+; GFX11-NEXT:    v_lshlrev_b16 v0, v6, v0
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_lshlrev_b16 v4, v6, v4
-; GFX11-NEXT:    v_lshrrev_b16 v1, v7, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_lshlrev_b16 v0, v2, v0
-; GFX11-NEXT:    v_or_b32_e32 v2, v4, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b16 v2, v7, v3
+; GFX11-NEXT:    v_lshrrev_b16 v3, v4, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
+; GFX11-NEXT:    v_or_b32_e32 v1, v2, v3
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
@@ -990,8 +978,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX6-NEXT:    s_or_b32 s0, s0, s2
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1043,8 +1030,7 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX8-NEXT:    s_or_b32 s3, s4, s3
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -1096,59 +1082,57 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX9-NEXT:    s_or_b32 s3, s4, s3
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
-; GFX9-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX9-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX9-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX9-NEXT:    s_or_b32 s0, s0, s1
 ; GFX9-NEXT:    ; return to shader part epilog
 ;
 ; GFX10-LABEL: s_fshr_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_lshr_b32 s6, s1, 8
-; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
 ; GFX10-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX10-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX10-NEXT:    s_lshr_b32 s11, s2, 24
-; GFX10-NEXT:    s_and_b32 s12, s2, 7
-; GFX10-NEXT:    s_andn2_b32 s2, 7, s2
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX10-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX10-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX10-NEXT:    s_and_b32 s11, s2, 7
+; GFX10-NEXT:    s_andn2_b32 s12, 7, s2
 ; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX10-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX10-NEXT:    s_and_b32 s2, s9, 7
+; GFX10-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX10-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX10-NEXT:    s_lshr_b32 s1, s1, s11
+; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX10-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX10-NEXT:    s_or_b32 s0, s0, s1
+; GFX10-NEXT:    s_and_b32 s1, s9, 7
 ; GFX10-NEXT:    s_andn2_b32 s9, 7, s9
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, s9
-; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
-; GFX10-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_or_b32 s1, s3, s2
-; GFX10-NEXT:    s_and_b32 s2, s10, 7
-; GFX10-NEXT:    s_andn2_b32 s3, 7, s10
+; GFX10-NEXT:    s_lshr_b32 s1, s6, s1
+; GFX10-NEXT:    s_and_b32 s6, s10, 7
+; GFX10-NEXT:    s_andn2_b32 s9, 7, s10
 ; GFX10-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX10-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX10-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX10-NEXT:    s_lshr_b32 s2, s6, s2
-; GFX10-NEXT:    s_andn2_b32 s4, 7, s11
+; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX10-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX10-NEXT:    s_lshl_b32 s4, s4, s9
+; GFX10-NEXT:    s_lshr_b32 s6, s7, s6
+; GFX10-NEXT:    s_or_b32 s1, s3, s1
+; GFX10-NEXT:    s_or_b32 s3, s4, s6
+; GFX10-NEXT:    s_andn2_b32 s4, 7, s2
 ; GFX10-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX10-NEXT:    s_and_b32 s6, s11, 7
-; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX10-NEXT:    s_lshr_b32 s5, s8, s6
-; GFX10-NEXT:    s_or_b32 s2, s3, s2
+; GFX10-NEXT:    s_and_b32 s2, s2, 7
 ; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX10-NEXT:    s_or_b32 s3, s4, s5
+; GFX10-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX10-NEXT:    s_lshr_b32 s2, s8, s2
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
-; GFX10-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX10-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX10-NEXT:    s_lshl_b32 s1, s3, 16
+; GFX10-NEXT:    s_or_b32 s2, s4, s2
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
 ; GFX10-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX10-NEXT:    s_or_b32 s0, s0, s1
@@ -1157,51 +1141,50 @@ define amdgpu_ps i32 @s_fshr_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg, i32 in
 ; GFX11-LABEL: s_fshr_v4i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_lshr_b32 s6, s1, 8
-; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
-; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
-; GFX11-NEXT:    s_lshr_b32 s5, s0, 24
 ; GFX11-NEXT:    s_lshr_b32 s7, s1, 16
 ; GFX11-NEXT:    s_lshr_b32 s8, s1, 24
-; GFX11-NEXT:    s_lshr_b32 s9, s2, 8
-; GFX11-NEXT:    s_lshr_b32 s10, s2, 16
-; GFX11-NEXT:    s_lshr_b32 s11, s2, 24
-; GFX11-NEXT:    s_and_b32 s12, s2, 7
-; GFX11-NEXT:    s_and_not1_b32 s2, 7, s2
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_lshr_b32 s3, s0, 8
+; GFX11-NEXT:    s_lshr_b32 s4, s0, 16
+; GFX11-NEXT:    s_lshr_b32 s5, s0, 24
+; GFX11-NEXT:    s_and_b32 s11, s2, 7
+; GFX11-NEXT:    s_and_not1_b32 s12, 7, s2
 ; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
 ; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
-; GFX11-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX11-NEXT:    s_and_b32 s2, s9, 7
+; GFX11-NEXT:    s_lshr_b32 s9, s2, 8
+; GFX11-NEXT:    s_lshl_b32 s0, s0, s12
+; GFX11-NEXT:    s_lshr_b32 s1, s1, s11
+; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_lshr_b32 s10, s2, 16
+; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_and_b32 s1, s9, 7
 ; GFX11-NEXT:    s_and_not1_b32 s9, 7, s9
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 1
 ; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_lshr_b32 s1, s1, s12
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, s9
-; GFX11-NEXT:    s_lshr_b32 s2, s6, s2
-; GFX11-NEXT:    s_and_b32 s6, s7, 0xff
-; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_or_b32 s1, s3, s2
-; GFX11-NEXT:    s_and_b32 s2, s10, 7
-; GFX11-NEXT:    s_and_not1_b32 s3, 7, s10
+; GFX11-NEXT:    s_lshr_b32 s1, s6, s1
+; GFX11-NEXT:    s_and_b32 s6, s10, 7
+; GFX11-NEXT:    s_and_not1_b32 s9, 7, s10
 ; GFX11-NEXT:    s_lshl_b32 s4, s4, 1
-; GFX11-NEXT:    s_and_b32 s6, 0xffff, s6
-; GFX11-NEXT:    s_lshl_b32 s3, s4, s3
-; GFX11-NEXT:    s_lshr_b32 s2, s6, s2
-; GFX11-NEXT:    s_and_not1_b32 s4, 7, s11
+; GFX11-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX11-NEXT:    s_lshr_b32 s2, s2, 24
+; GFX11-NEXT:    s_lshl_b32 s4, s4, s9
+; GFX11-NEXT:    s_lshr_b32 s6, s7, s6
+; GFX11-NEXT:    s_or_b32 s1, s3, s1
+; GFX11-NEXT:    s_or_b32 s3, s4, s6
+; GFX11-NEXT:    s_and_not1_b32 s4, 7, s2
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 1
-; GFX11-NEXT:    s_and_b32 s6, s11, 7
-; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
-; GFX11-NEXT:    s_lshr_b32 s5, s8, s6
-; GFX11-NEXT:    s_or_b32 s2, s3, s2
+; GFX11-NEXT:    s_and_b32 s2, s2, 7
 ; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
-; GFX11-NEXT:    s_or_b32 s3, s4, s5
+; GFX11-NEXT:    s_lshl_b32 s4, s5, s4
+; GFX11-NEXT:    s_lshr_b32 s2, s8, s2
 ; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
+; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
-; GFX11-NEXT:    s_lshl_b32 s1, s2, 16
-; GFX11-NEXT:    s_and_b32 s2, s3, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s3, 16
+; GFX11-NEXT:    s_or_b32 s2, s4, s2
 ; GFX11-NEXT:    s_or_b32 s0, s0, s1
 ; GFX11-NEXT:    s_lshl_b32 s1, s2, 24
 ; GFX11-NEXT:    s_delay_alu instid0(SALU_CYCLE_1)
@@ -1264,8 +1247,7 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v2
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1307,15 +1289,13 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX8-NEXT:    v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
 ; GFX8-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX8-NEXT:    v_or_b32_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_e32 v2, 0xff, v4
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX8-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1357,124 +1337,118 @@ define i32 @v_fshr_v4i8(i32 %lhs.arg, i32 %rhs.arg, i32 %amt.arg) {
 ; GFX9-NEXT:    v_lshlrev_b16_sdwa v0, v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, v7, v0
 ; GFX9-NEXT:    v_lshrrev_b16_sdwa v1, v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-; GFX9-NEXT:    v_or_b32_e32 v0, v0, v1
+; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 8
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX9-NEXT:    v_and_or_b32 v1, v2, v9, v1
 ; GFX9-NEXT:    v_and_b32_e32 v2, 0xff, v4
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v0, v1, v2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10-LABEL: v_fshr_v4i8:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_lshrrev_b32_e32 v5, 8, v2
+; GFX10-NEXT:    v_not_b32_e32 v5, v2
+; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
-; GFX10-NEXT:    v_not_b32_e32 v8, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX10-NEXT:    v_not_b32_e32 v10, v5
-; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v6, 24, v0
-; GFX10-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_and_b32_e32 v5, 7, v5
 ; GFX10-NEXT:    v_lshlrev_b16 v0, 1, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 7, v8
-; GFX10-NEXT:    v_mov_b32_e32 v13, 0xff
-; GFX10-NEXT:    v_not_b32_e32 v14, v12
-; GFX10-NEXT:    v_lshlrev_b16 v3, v10, v3
-; GFX10-NEXT:    v_not_b32_e32 v10, v11
+; GFX10-NEXT:    v_not_b32_e32 v10, v7
+; GFX10-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_lshlrev_b16 v3, 1, v3
+; GFX10-NEXT:    v_lshlrev_b16 v0, v5, v0
+; GFX10-NEXT:    v_and_b32_e32 v5, 7, v10
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v7
+; GFX10-NEXT:    v_and_b32_e32 v8, 0xff, v8
+; GFX10-NEXT:    v_not_b32_e32 v13, v11
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX10-NEXT:    v_lshlrev_b16 v0, v8, v0
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xff, v1
-; GFX10-NEXT:    v_and_b32_e32 v5, 7, v5
-; GFX10-NEXT:    v_and_b32_e32 v7, 0xff, v7
-; GFX10-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX10-NEXT:    v_and_b32_e32 v10, 7, v10
+; GFX10-NEXT:    v_lshlrev_b16 v3, v5, v3
+; GFX10-NEXT:    v_mov_b32_e32 v5, 0xff
+; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
+; GFX10-NEXT:    v_and_b32_e32 v10, 0xff, v1
+; GFX10-NEXT:    v_lshrrev_b16 v7, v7, v8
+; GFX10-NEXT:    v_and_b32_e32 v8, 7, v11
+; GFX10-NEXT:    v_and_b32_e32 v11, 7, v13
 ; GFX10-NEXT:    v_lshlrev_b16 v4, 1, v4
-; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_e32 v13, 7, v14
-; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
-; GFX10-NEXT:    v_and_b32_e32 v12, 7, v12
+; GFX10-NEXT:    v_and_b32_sdwa v1, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX10-NEXT:    v_lshrrev_b16 v5, v5, v7
-; GFX10-NEXT:    v_lshlrev_b16 v4, v10, v4
-; GFX10-NEXT:    v_lshrrev_b16 v1, v11, v1
-; GFX10-NEXT:    v_lshlrev_b16 v6, v13, v6
-; GFX10-NEXT:    v_lshrrev_b16 v7, v12, v9
-; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v8
-; GFX10-NEXT:    v_or_b32_e32 v3, v3, v5
-; GFX10-NEXT:    v_mov_b32_e32 v5, 8
+; GFX10-NEXT:    v_not_b32_e32 v5, v12
+; GFX10-NEXT:    v_or_b32_e32 v3, v3, v7
+; GFX10-NEXT:    v_lshlrev_b16 v4, v11, v4
+; GFX10-NEXT:    v_lshrrev_b16 v1, v8, v1
+; GFX10-NEXT:    v_lshrrev_b16 v2, v2, v10
+; GFX10-NEXT:    v_and_b32_e32 v5, 7, v5
+; GFX10-NEXT:    v_lshlrev_b16 v6, 1, v6
+; GFX10-NEXT:    v_and_b32_e32 v7, 7, v12
+; GFX10-NEXT:    v_mov_b32_e32 v8, 8
 ; GFX10-NEXT:    v_or_b32_e32 v1, v4, v1
-; GFX10-NEXT:    v_or_b32_e32 v4, v6, v7
 ; GFX10-NEXT:    v_or_b32_e32 v0, v0, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b16 v2, v5, v6
+; GFX10-NEXT:    v_lshrrev_b16 v4, v7, v9
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX10-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v2
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v3
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_fshr_v4i8:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v7, 8, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v0
+; GFX11-NEXT:    v_lshrrev_b32_e32 v6, 8, v1
+; GFX11-NEXT:    v_not_b32_e32 v9, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
-; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX11-NEXT:    v_not_b32_e32 v12, v7
-; GFX11-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v4, 16, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 24, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v8, 16, v1
+; GFX11-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX11-NEXT:    v_and_b32_e32 v9, 7, v9
+; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
+; GFX11-NEXT:    v_and_b32_e32 v7, 7, v7
 ; GFX11-NEXT:    v_and_b32_e32 v12, 7, v12
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 1, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX11-NEXT:    v_not_b32_e32 v14, v11
-; GFX11-NEXT:    v_lshrrev_b16 v6, v7, v6
-; GFX11-NEXT:    v_not_b32_e32 v7, v13
-; GFX11-NEXT:    v_lshrrev_b32_e32 v9, 24, v1
-; GFX11-NEXT:    v_not_b32_e32 v10, v2
-; GFX11-NEXT:    v_lshlrev_b16 v3, v12, v3
-; GFX11-NEXT:    v_and_b32_e32 v11, 7, v11
-; GFX11-NEXT:    v_and_b32_e32 v12, 7, v14
+; GFX11-NEXT:    v_lshlrev_b16 v0, v9, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v4, 1, v4
+; GFX11-NEXT:    v_lshlrev_b16 v3, v12, v3
+; GFX11-NEXT:    v_lshrrev_b16 v6, v7, v6
+; GFX11-NEXT:    v_and_b32_e32 v7, 7, v11
+; GFX11-NEXT:    v_and_b32_e32 v9, 7, v14
 ; GFX11-NEXT:    v_and_b32_e32 v8, 0xff, v8
-; GFX11-NEXT:    v_and_b32_e32 v7, 7, v7
-; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
-; GFX11-NEXT:    v_and_b32_e32 v13, 7, v13
+; GFX11-NEXT:    v_not_b32_e32 v11, v13
+; GFX11-NEXT:    v_lshrrev_b32_e32 v10, 24, v1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 7, v2
-; GFX11-NEXT:    v_and_b32_e32 v10, 7, v10
-; GFX11-NEXT:    v_lshlrev_b16 v0, 1, v0
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_or_b32_e32 v3, v3, v6
-; GFX11-NEXT:    v_lshlrev_b16 v4, v12, v4
-; GFX11-NEXT:    v_lshrrev_b16 v6, v11, v8
-; GFX11-NEXT:    v_lshlrev_b16 v5, v7, v5
-; GFX11-NEXT:    v_lshrrev_b16 v7, v13, v9
-; GFX11-NEXT:    v_lshlrev_b16 v0, v10, v0
+; GFX11-NEXT:    v_lshlrev_b16 v4, v9, v4
+; GFX11-NEXT:    v_lshrrev_b16 v6, v7, v8
+; GFX11-NEXT:    v_and_b32_e32 v7, 7, v11
+; GFX11-NEXT:    v_lshlrev_b16 v5, 1, v5
+; GFX11-NEXT:    v_and_b32_e32 v8, 7, v13
 ; GFX11-NEXT:    v_lshrrev_b16 v1, v2, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-NEXT:    v_or_b32_e32 v3, v4, v6
-; GFX11-NEXT:    v_or_b32_e32 v4, v5, v7
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_or_b32_e32 v2, v4, v6
+; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_lshlrev_b16 v4, v7, v5
+; GFX11-NEXT:    v_lshrrev_b16 v5, v8, v10
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v3, v4, v5
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %lhs = bitcast i32 %lhs.arg to <4 x i8>
@@ -1922,8 +1896,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v1, v2, v1
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 24, v0
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_bfe_u32 v2, v0, 8, 8
 ; GFX6-NEXT:    v_bfe_u32 v0, v0, 16, 8
@@ -1936,19 +1909,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-LABEL: s_fshr_v2i24:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX8-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX8-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX8-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX8-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX8-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX8-NEXT:    s_or_b32 s1, s8, s1
-; GFX8-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX8-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX8-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX8-NEXT:    s_or_b32 s1, s8, s1
+; GFX8-NEXT:    s_lshr_b32 s8, s2, 8
+; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX8-NEXT:    s_or_b32 s0, s0, s6
 ; GFX8-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX8-NEXT:    s_and_b32 s7, s9, 0xff
@@ -1956,27 +1927,24 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    s_lshr_b32 s10, s2, 24
 ; GFX8-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX8-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_or_b32 s2, s2, s8
 ; GFX8-NEXT:    s_and_b32 s8, s9, 0xff
-; GFX8-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX8-NEXT:    s_lshr_b32 s11, s3, 8
 ; GFX8-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX8-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX8-NEXT:    s_or_b32 s2, s2, s8
 ; GFX8-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX8-NEXT:    s_and_b32 s8, s11, 0xff
-; GFX8-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    s_or_b32 s3, s10, s3
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX8-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX8-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX8-NEXT:    s_or_b32 s3, s3, s8
 ; GFX8-NEXT:    s_lshr_b32 s8, s4, 8
-; GFX8-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX8-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX8-NEXT:    s_lshr_b32 s10, s4, 24
 ; GFX8-NEXT:    s_and_b32 s4, s4, 0xff
@@ -1991,10 +1959,9 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, v0, v1
 ; GFX8-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX8-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX8-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_and_b32 s8, s11, 0xff
+; GFX8-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX8-NEXT:    s_or_b32 s5, s10, s5
 ; GFX8-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX8-NEXT:    s_and_b32 s5, 0xffff, s5
@@ -2044,8 +2011,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX8-NEXT:    v_or_b32_sdwa v3, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v3, v1
-; GFX8-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX8-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
 ; GFX8-NEXT:    v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX8-NEXT:    v_or_b32_e32 v1, v1, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
@@ -2056,19 +2022,17 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-LABEL: s_fshr_v2i24:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
-; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX9-NEXT:    s_and_b32 s1, s1, 0xff
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX9-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX9-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX9-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX9-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX9-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX9-NEXT:    s_or_b32 s1, s8, s1
-; GFX9-NEXT:    s_lshr_b32 s8, s2, 8
 ; GFX9-NEXT:    s_lshr_b32 s7, s0, 16
 ; GFX9-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX9-NEXT:    s_and_b32 s8, s8, 0xff
+; GFX9-NEXT:    s_or_b32 s1, s8, s1
+; GFX9-NEXT:    s_lshr_b32 s8, s2, 8
+; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX9-NEXT:    s_or_b32 s0, s0, s6
 ; GFX9-NEXT:    s_and_b32 s6, s7, 0xff
 ; GFX9-NEXT:    s_and_b32 s7, s9, 0xff
@@ -2076,27 +2040,24 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    s_lshr_b32 s10, s2, 24
 ; GFX9-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 8
-; GFX9-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_or_b32 s2, s2, s8
 ; GFX9-NEXT:    s_and_b32 s8, s9, 0xff
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX9-NEXT:    s_and_b32 s8, 0xffff, s8
 ; GFX9-NEXT:    s_lshr_b32 s11, s3, 8
 ; GFX9-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
-; GFX9-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
 ; GFX9-NEXT:    s_or_b32 s2, s2, s8
 ; GFX9-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX9-NEXT:    s_and_b32 s8, s11, 0xff
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xffffffe8
+; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    s_or_b32 s3, s10, s3
 ; GFX9-NEXT:    s_and_b32 s8, 0xffff, s8
-; GFX9-NEXT:    v_mul_lo_u32 v1, v0, v1
 ; GFX9-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX9-NEXT:    s_lshl_b32 s8, s8, 16
 ; GFX9-NEXT:    s_or_b32 s3, s3, s8
 ; GFX9-NEXT:    s_lshr_b32 s8, s4, 8
-; GFX9-NEXT:    s_and_b32 s8, s8, 0xff
 ; GFX9-NEXT:    s_lshr_b32 s9, s4, 16
 ; GFX9-NEXT:    s_lshr_b32 s10, s4, 24
 ; GFX9-NEXT:    s_and_b32 s4, s4, 0xff
@@ -2111,7 +2072,6 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_add_u32_e32 v0, v0, v1
 ; GFX9-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX9-NEXT:    s_lshr_b32 s11, s5, 8
-; GFX9-NEXT:    s_and_b32 s5, s5, 0xff
 ; GFX9-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX9-NEXT:    s_and_b32 s8, s11, 0xff
 ; GFX9-NEXT:    s_or_b32 s5, s10, s5
@@ -2162,8 +2122,7 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX9-NEXT:    v_and_or_b32 v2, v1, v2, v3
 ; GFX9-NEXT:    v_mov_b32_e32 v3, 16
 ; GFX9-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX9-NEXT:    v_and_b32_e32 v3, 0xff, v0
-; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX9-NEXT:    v_lshlrev_b32_e32 v3, 24, v0
 ; GFX9-NEXT:    v_or3_b32 v1, v2, v1, v3
 ; GFX9-NEXT:    v_bfe_u32 v2, v0, 8, 8
 ; GFX9-NEXT:    v_bfe_u32 v0, v0, 16, 8
@@ -2177,114 +2136,107 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX10-NEXT:    s_lshr_b32 s14, s4, 8
 ; GFX10-NEXT:    s_lshr_b32 s15, s4, 16
-; GFX10-NEXT:    s_and_b32 s14, s14, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s16, s4, 24
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX10-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX10-NEXT:    s_and_b32 s15, 0xffff, s15
+; GFX10-NEXT:    s_and_b32 s15, s15, 0xff
 ; GFX10-NEXT:    s_or_b32 s4, s4, s14
+; GFX10-NEXT:    s_and_b32 s14, 0xffff, s15
 ; GFX10-NEXT:    s_lshr_b32 s17, s5, 8
-; GFX10-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX10-NEXT:    s_lshl_b32 s14, s15, 16
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
 ; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX10-NEXT:    s_or_b32 s5, s16, s5
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX10-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX10-NEXT:    s_and_b32 s15, s17, 0xff
+; GFX10-NEXT:    s_and_b32 s16, s17, 0xff
 ; GFX10-NEXT:    s_or_b32 s4, s4, s14
-; GFX10-NEXT:    s_or_b32 s5, s16, s5
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX10-NEXT:    s_and_b32 s14, 0xffff, s15
+; GFX10-NEXT:    s_and_b32 s15, 0xffff, s16
 ; GFX10-NEXT:    s_and_b32 s5, 0xffff, s5
-; GFX10-NEXT:    s_lshl_b32 s14, s14, 16
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
+; GFX10-NEXT:    s_lshl_b32 s14, s15, 16
+; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
+; GFX10-NEXT:    s_or_b32 s5, s5, s14
 ; GFX10-NEXT:    s_lshr_b32 s9, s1, 8
 ; GFX10-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
-; GFX10-NEXT:    s_or_b32 s5, s5, s14
-; GFX10-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX10-NEXT:    s_lshr_b32 s10, s2, 8
-; GFX10-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX10-NEXT:    s_lshr_b32 s11, s2, 16
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX10-NEXT:    s_and_b32 s9, s9, 0xff
-; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
-; GFX10-NEXT:    s_and_b32 s10, s10, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s12, s2, 24
 ; GFX10-NEXT:    s_and_b32 s2, s2, 0xff
 ; GFX10-NEXT:    s_or_b32 s1, s8, s1
-; GFX10-NEXT:    s_and_b32 s8, 0xffff, s9
+; GFX10-NEXT:    s_and_b32 s8, s9, 0xff
+; GFX10-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX10-NEXT:    s_lshl_b32 s9, s10, 8
+; GFX10-NEXT:    s_and_b32 s10, s11, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
 ; GFX10-NEXT:    s_or_b32 s2, s2, s9
+; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX10-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v1
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
 ; GFX10-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX10-NEXT:    s_and_b32 s3, s3, 0xff
+; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
+; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX10-NEXT:    s_and_b32 s6, s6, 0xff
-; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX10-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX10-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX10-NEXT:    s_and_b32 s11, s13, 0xff
+; GFX10-NEXT:    s_or_b32 s0, s0, s6
 ; GFX10-NEXT:    s_or_b32 s3, s12, s3
+; GFX10-NEXT:    s_and_b32 s6, 0xffff, s7
+; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
 ; GFX10-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX10-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX10-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX10-NEXT:    s_or_b32 s0, s0, s6
-; GFX10-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX10-NEXT:    s_and_b32 s7, 0xffff, s7
-; GFX10-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX10-NEXT:    s_lshl_b32 s6, s6, 17
+; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX10-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX10-NEXT:    s_or_b32 s0, s6, s0
+; GFX10-NEXT:    s_and_b32 s7, 0xffff, s8
+; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX10-NEXT:    s_and_b32 s4, s11, 0xff
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
+; GFX10-NEXT:    s_and_b32 s4, 0xffff, s10
+; GFX10-NEXT:    s_and_b32 s5, 0xffff, s11
+; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
 ; GFX10-NEXT:    s_or_b32 s2, s2, s4
-; GFX10-NEXT:    s_and_b32 s4, s13, 0xff
+; GFX10-NEXT:    s_lshl_b32 s5, s5, 16
+; GFX10-NEXT:    s_lshl_b32 s7, s7, 17
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
-; GFX10-NEXT:    s_lshl_b32 s4, s4, 16
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_mov_b32_e32 v4, 16
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX10-NEXT:    s_or_b32 s3, s3, s4
-; GFX10-NEXT:    s_lshl_b32 s4, s7, 17
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, 24, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX10-NEXT:    s_or_b32 s0, s4, s0
-; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
+; GFX10-NEXT:    v_sub_nc_u32_e32 v2, 23, v1
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v3, vcc_lo
 ; GFX10-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xffffff, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
 ; GFX10-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX10-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
-; GFX10-NEXT:    s_lshl_b32 s2, s8, 17
+; GFX10-NEXT:    s_or_b32 s2, s3, s5
 ; GFX10-NEXT:    v_and_b32_e32 v3, 0xffffff, v3
-; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s3
+; GFX10-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
 ; GFX10-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
-; GFX10-NEXT:    s_or_b32 s0, s2, s1
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 8
+; GFX10-NEXT:    s_or_b32 s0, s7, s1
 ; GFX10-NEXT:    v_lshl_or_b32 v0, s0, v3, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 16
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v2, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-; GFX10-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX10-NEXT:    v_and_or_b32 v2, 0xff, v1, v2
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-; GFX10-NEXT:    v_lshlrev_b32_e32 v3, 24, v4
-; GFX10-NEXT:    v_bfe_u32 v4, v0, 8, 8
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 24, v0
+; GFX10-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX10-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
 ; GFX10-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX10-NEXT:    v_or3_b32 v1, v2, v1, v3
-; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v4
+; GFX10-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX10-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX10-NEXT:    v_readfirstlane_b32 s1, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -2294,124 +2246,117 @@ define amdgpu_ps i48 @s_fshr_v2i24(i48 inreg %lhs.arg, i48 inreg %rhs.arg, i48 i
 ; GFX11-NEXT:    v_cvt_f32_ubyte0_e32 v0, 24
 ; GFX11-NEXT:    s_lshr_b32 s14, s4, 8
 ; GFX11-NEXT:    s_lshr_b32 s15, s4, 16
-; GFX11-NEXT:    s_and_b32 s14, s14, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s16, s4, 24
-; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_and_b32 s4, s4, 0xff
-; GFX11-NEXT:    s_and_b32 s15, s15, 0xff
+; GFX11-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX11-NEXT:    s_lshl_b32 s14, s14, 8
-; GFX11-NEXT:    s_and_b32 s15, 0xffff, s15
+; GFX11-NEXT:    s_and_b32 s15, s15, 0xff
 ; GFX11-NEXT:    s_or_b32 s4, s4, s14
+; GFX11-NEXT:    s_and_b32 s14, 0xffff, s15
 ; GFX11-NEXT:    s_lshr_b32 s17, s5, 8
-; GFX11-NEXT:    s_and_b32 s5, s5, 0xff
-; GFX11-NEXT:    s_lshl_b32 s14, s15, 16
 ; GFX11-NEXT:    s_and_b32 s4, 0xffff, s4
-; GFX11-NEXT:    s_waitcnt_depctr 0xfff
-; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
 ; GFX11-NEXT:    s_lshl_b32 s5, s5, 8
-; GFX11-NEXT:    s_and_b32 s15, s17, 0xff
 ; GFX11-NEXT:    s_or_b32 s4, s4, s14
+; GFX11-NEXT:    s_waitcnt_depctr 0xfff
+; GFX11-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
+; GFX11-NEXT:    s_and_b32 s14, s17, 0xff
 ; GFX11-NEXT:    s_or_b32 s5, s16, s5
-; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
-; GFX11-NEXT:    s_and_b32 s14, 0xffff, s15
+; GFX11-NEXT:    s_and_b32 s14, 0xffff, s14
 ; GFX11-NEXT:    s_and_b32 s5, 0xffff, s5
+; GFX11-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX11-NEXT:    s_lshl_b32 s14, s14, 16
-; GFX11-NEXT:    s_lshr_b32 s10, s2, 8
-; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
 ; GFX11-NEXT:    s_or_b32 s5, s5, s14
+; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
+; GFX11-NEXT:    v_mul_lo_u32 v1, 0xffffffe8, v0
+; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
 ; GFX11-NEXT:    s_lshr_b32 s9, s1, 8
-; GFX11-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
+; GFX11-NEXT:    s_lshr_b32 s10, s2, 8
 ; GFX11-NEXT:    s_lshr_b32 s11, s2, 16
-; GFX11-NEXT:    s_and_b32 s10, s10, 0xff
-; GFX11-NEXT:    s_lshr_b32 s6, s0, 8
-; GFX11-NEXT:    s_lshr_b32 s8, s0, 24
-; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
+; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX11-NEXT:    s_lshr_b32 s12, s2, 24
+; GFX11-NEXT:    v_mul_hi_u32 v1, v0, v1
 ; GFX11-NEXT:    s_and_b32 s2, s2, 0xff
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 8
-; GFX11-NEXT:    s_and_b32 s9, s9, 0xff
-; GFX11-NEXT:    s_and_b32 s11, s11, 0xff
-; GFX11-NEXT:    s_and_b32 s6, s6, 0xff
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
+; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
 ; GFX11-NEXT:    s_or_b32 s1, s8, s1
+; GFX11-NEXT:    s_and_b32 s8, s9, 0xff
+; GFX11-NEXT:    s_lshl_b32 s9, s10, 8
+; GFX11-NEXT:    s_and_b32 s10, s11, 0xff
 ; GFX11-NEXT:    v_add_nc_u32_e32 v0, v0, v1
-; GFX11-NEXT:    s_and_b32 s8, 0xffff, s9
-; GFX11-NEXT:    s_and_b32 s9, 0xffff, s11
-; GFX11-NEXT:    s_lshr_b32 s7, s0, 16
-; GFX11-NEXT:    s_and_b32 s0, s0, 0xff
+; GFX11-NEXT:    s_or_b32 s0, s0, s6
+; GFX11-NEXT:    s_and_b32 s6, 0xffff, s7
+; GFX11-NEXT:    s_or_b32 s2, s2, s9
+; GFX11-NEXT:    s_and_b32 s7, 0xffff, s10
 ; GFX11-NEXT:    v_mul_hi_u32 v1, s4, v0
 ; GFX11-NEXT:    v_mul_hi_u32 v0, s5, v0
-; GFX11-NEXT:    s_lshl_b32 s6, s6, 8
-; GFX11-NEXT:    s_and_b32 s7, s7, 0xff
-; GFX11-NEXT:    s_or_b32 s0, s0, s6
-; GFX11-NEXT:    s_and_b32 s7, 0xffff, s7
+; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
 ; GFX11-NEXT:    s_and_b32 s0, 0xffff, s0
+; GFX11-NEXT:    s_lshl_b32 s6, s6, 17
+; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
 ; GFX11-NEXT:    s_lshr_b32 s13, s3, 8
+; GFX11-NEXT:    s_or_b32 s0, s6, s0
 ; GFX11-NEXT:    v_mul_lo_u32 v1, v1, 24
 ; GFX11-NEXT:    v_mul_lo_u32 v0, v0, 24
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 1
-; GFX11-NEXT:    s_and_b32 s3, s3, 0xff
-; GFX11-NEXT:    s_and_b32 s13, s13, 0xff
 ; GFX11-NEXT:    s_lshl_b32 s3, s3, 8
-; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_and_b32 s11, s13, 0xff
 ; GFX11-NEXT:    s_or_b32 s3, s12, s3
+; GFX11-NEXT:    s_and_b32 s1, 0xffff, s1
+; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
+; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v1, s4, v1
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v0, s5, v0
-; GFX11-NEXT:    s_lshl_b32 s4, s10, 8
-; GFX11-NEXT:    s_and_b32 s10, 0xffff, s13
-; GFX11-NEXT:    s_or_b32 s2, s2, s4
+; GFX11-NEXT:    s_lshl_b32 s5, s7, 16
+; GFX11-NEXT:    s_and_b32 s4, 0xffff, s8
+; GFX11-NEXT:    s_or_b32 s2, s2, s5
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v1
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    s_lshl_b32 s4, s9, 16
-; GFX11-NEXT:    s_and_b32 s2, 0xffff, s2
-; GFX11-NEXT:    s_and_b32 s3, 0xffff, s3
-; GFX11-NEXT:    s_or_b32 s2, s2, s4
+; GFX11-NEXT:    s_and_b32 s8, 0xffff, s11
+; GFX11-NEXT:    s_lshl_b32 s4, s4, 17
+; GFX11-NEXT:    s_lshl_b32 s7, s8, 16
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
-; GFX11-NEXT:    s_lshl_b32 s4, s7, 17
-; GFX11-NEXT:    s_lshl_b32 s5, s10, 16
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v3, 24, v1
-; GFX11-NEXT:    s_or_b32 s0, s4, s0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v1
-; GFX11-NEXT:    s_lshl_b32 s1, s1, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v3, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v1
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xffffff, v1
 ; GFX11-NEXT:    v_subrev_nc_u32_e32 v2, 24, v0
 ; GFX11-NEXT:    v_cmp_le_u32_e32 vcc_lo, 24, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v1, v1, s2
-; GFX11-NEXT:    s_or_b32 s2, s3, s5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_or_b32 s2, s3, s7
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v3, 23, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
 ; GFX11-NEXT:    v_lshl_or_b32 v1, s0, v2, v1
-; GFX11-NEXT:    s_lshl_b32 s0, s8, 17
-; GFX11-NEXT:    s_or_b32 s0, s0, s1
+; GFX11-NEXT:    s_or_b32 s0, s4, s1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xffffff, v3
-; GFX11-NEXT:    v_bfe_u32 v3, v1, 16, 8
+; GFX11-NEXT:    v_bfe_u32 v3, v1, 8, 8
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 8, v3
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xffffff, v0
 ; GFX11-NEXT:    v_lshrrev_b32_e64 v0, v0, s2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, s0, v2, v0
-; GFX11-NEXT:    v_bfe_u32 v2, v1, 8, 8
-; GFX11-NEXT:    v_and_b32_e32 v4, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 8, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v2
-; GFX11-NEXT:    v_bfe_u32 v2, v0, 8, 8
+; GFX11-NEXT:    v_bfe_u32 v2, v1, 16, 8
+; GFX11-NEXT:    v_and_or_b32 v1, 0xff, v1, v3
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 24, v0
+; GFX11-NEXT:    v_bfe_u32 v5, v0, 8, 8
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
 ; GFX11-NEXT:    v_bfe_u32 v0, v0, 16, 8
-; GFX11-NEXT:    v_or3_b32 v1, v1, v3, v4
+; GFX11-NEXT:    v_or3_b32 v1, v1, v2, v4
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v2
+; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 8, v5
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_readfirstlane_b32 s1, v0
@@ -4231,7 +4176,6 @@ define amdgpu_ps float @v_fshr_v2i16_vss(<2 x i16> %lhs, <2 x i16> inreg %rhs, <
 define amdgpu_ps i48 @s_fshr_v3i16(<3 x i16> inreg %lhs, <3 x i16> inreg %rhs, <3 x i16> inreg %amt) {
 ; GFX6-LABEL: s_fshr_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_or_b32 s6, s6, s7
@@ -4484,7 +4428,6 @@ define <3 x half> @v_fshr_v3i16(<3 x i16> %lhs, <3 x i16> %rhs, <3 x i16> %amt)
 ; GFX6-LABEL: v_fshr_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v7, 0xffff, v7
 ; GFX6-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v6, v6, v7
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
index d5bfb7faf7fc..adf301a376f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll
@@ -17,7 +17,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -37,7 +36,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s5, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -71,36 +69,33 @@ define amdgpu_ps void @insertelement_s_v2i8_s_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s5, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX10-NEXT:    global_load_ushort v0, v0, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s5, 0
-; GFX10-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v0, s4, s0
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s4, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: insertelement_s_v2i8_s_s:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s5, 1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s5, 0
 ; GFX11-NEXT:    global_load_u16 v0, v0, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s5, 0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s4, s0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s5, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, s0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
@@ -125,7 +120,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -143,7 +137,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s3, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -177,36 +170,34 @@ define amdgpu_ps void @insertelement_v_v2i8_s_s(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 1
-; GFX10-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, s0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 s0, s3, 0
-; GFX10-NEXT:    v_and_b32_sdwa v2, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v0, s2, s0
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s2, s0
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: insertelement_v_v2i8_s_s:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s3, 1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s3, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s2, s0
-; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s3, 0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, s0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, s3, 1
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s2, s0
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
@@ -229,7 +220,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -248,7 +238,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s4, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -281,37 +270,35 @@ define amdgpu_ps void @insertelement_s_v2i8_v_s(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 0
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e32 v4, v1, v0, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v1, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: insertelement_s_v2i8_v_s:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
+; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 0
 ; GFX11-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 0
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v1, v0 :: v_dual_and_b32 v1, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
+; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s4, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
@@ -335,7 +322,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -355,7 +341,6 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v1, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -389,38 +374,35 @@ define amdgpu_ps void @insertelement_s_v2i8_s_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    global_load_ushort v1, v1, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, s4, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e64 v4, v1, s4, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, v1, s4, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_short v[0:1], v2, off
 ; GFX10-NEXT:    s_endpgm
 ;
 ; GFX11-LABEL: insertelement_s_v2i8_s_v:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    global_load_u16 v1, v1, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 8, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
-; GFX11-NEXT:    v_cndmask_b32_e64 v2, v2, s4, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    v_cndmask_b32_e64 v0, v1, s4, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s4, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_cndmask_b32_e64 v0, v2, s4, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
@@ -443,7 +425,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v3, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -462,7 +443,6 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX8-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v3, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -495,13 +475,12 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_mov_b32_e32 v2, 0
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX10-NEXT:    global_load_ushort v2, v2, s[2:3]
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
 ; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v3, v3, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -511,21 +490,20 @@ define amdgpu_ps void @insertelement_s_v2i8_v_v(ptr addrspace(4) inreg %ptr, i8
 ;
 ; GFX11-LABEL: insertelement_s_v2i8_v_v:
 ; GFX11:       ; %bb.0:
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v2, 0
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
 ; GFX11-NEXT:    global_load_u16 v2, v2, s[2:3]
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v3, 8, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e32 v3, v3, v0, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v1
-; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xff, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX11-NEXT:    v_cndmask_b32_e32 v2, v2, v0, vcc_lo
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX11-NEXT:    v_cndmask_b32_e32 v0, v3, v0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v0
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
@@ -548,7 +526,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -566,7 +543,6 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v1, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v2
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v3, v1, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -600,12 +576,11 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e64 v2, v0, s2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -616,20 +591,18 @@ define amdgpu_ps void @insertelement_v_v2i8_s_v(ptr addrspace(1) %ptr, i8 inreg
 ; GFX11-LABEL: insertelement_v_v2i8_s_v:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v2
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, v0, s2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
+; GFX11-NEXT:    v_cndmask_b32_e64 v1, v1, s2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
+; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX11-NEXT:    global_store_b16 v[0:1], v2, off
 ; GFX11-NEXT:    s_nop 0
@@ -651,7 +624,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -668,7 +640,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e64 vcc, s2, 1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -701,12 +672,11 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
-; GFX10-NEXT:    v_mov_b32_e32 v3, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -717,16 +687,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_s(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-LABEL: insertelement_v_v2i8_v_s:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 0
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e64 vcc_lo, s2, 1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v2, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
@@ -752,7 +719,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX9-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
@@ -769,7 +735,6 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX8-NEXT:    v_cndmask_b32_e32 v4, v0, v2, vcc
 ; GFX8-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v0, v1, v2, vcc
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v2, 8, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, 0
@@ -802,12 +767,11 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    global_load_ushort v0, v[0:1], off
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
 ; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
 ; GFX10-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
-; GFX10-NEXT:    v_and_b32_sdwa v3, v1, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX10-NEXT:    v_cndmask_b32_e32 v2, v0, v2, vcc_lo
 ; GFX10-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
@@ -818,16 +782,13 @@ define amdgpu_ps void @insertelement_v_v2i8_v_v(ptr addrspace(1) %ptr, i8 %val,
 ; GFX11-LABEL: insertelement_v_v2i8_v_v:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    global_load_u16 v0, v[0:1], off
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    s_waitcnt vmcnt(0)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 8, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_cndmask_b32_e32 v1, v1, v2, vcc_lo
-; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0, v3
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v2, vcc_lo
-; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_dual_cndmask_b32 v1, v1, v2 :: v_dual_and_b32 v2, 0xff, v0
 ; GFX11-NEXT:    v_lshlrev_b16 v3, 8, v1
 ; GFX11-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX11-NEXT:    v_mov_b32_e32 v1, 0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
index 19b0057d69b6..b5c3367e055d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.1d.d16.ll
@@ -315,7 +315,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xy(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x3 unorm d16
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-UNPACKED-NEXT:    ; return to shader part epilog
@@ -392,7 +391,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x5 unorm d16
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-UNPACKED-NEXT:    ; return to shader part epilog
@@ -469,7 +467,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_xw(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x9 unorm d16
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-UNPACKED-NEXT:    ; return to shader part epilog
@@ -546,7 +543,6 @@ define amdgpu_ps <2 x half> @load_1d_v2f16_yz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:1], v0, s[0:7] dmask:0x6 unorm d16
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-UNPACKED-NEXT:    ; return to shader part epilog
@@ -623,10 +619,9 @@ define amdgpu_ps <3 x half> @load_1d_v3f16_xyz(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:2], v0, s[0:7] dmask:0x7 unorm d16
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v3, 0xffff, v1
+; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
-; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-UNPACKED-NEXT:    ; return to shader part epilog
 ;
 ; GFX8-PACKED-LABEL: load_1d_v3f16_xyz:
@@ -702,8 +697,6 @@ define amdgpu_ps <4 x half> @load_1d_v4f16_xyzw(<8 x i32> inreg %rsrc, i32 %s) {
 ; GFX8-UNPACKED-NEXT:    s_mov_b32 s7, s9
 ; GFX8-UNPACKED-NEXT:    image_load v[0:3], v0, s[0:7] dmask:0xf unorm d16
 ; GFX8-UNPACKED-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX8-UNPACKED-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-UNPACKED-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX8-UNPACKED-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
index 47e476de74cf..0881686e27d4 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll
@@ -51,13 +51,11 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16(i32 %node_ptr, float %
 ; GFX10-LABEL: image_bvh_intersect_ray_a16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
-; GFX10-NEXT:    v_and_b32_e32 v10, 0xffff, v7
-; GFX10-NEXT:    v_and_b32_e32 v8, 0xffff, v8
-; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v7
 ; GFX10-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX10-NEXT:    v_and_or_b32 v5, 0xffff, v5, v9
+; GFX10-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
 ; GFX10-NEXT:    v_and_or_b32 v6, 0xffff, v6, v10
+; GFX10-NEXT:    v_and_or_b32 v5, 0xffff, v5, v9
 ; GFX10-NEXT:    image_bvh_intersect_ray v[0:3], v[0:7], s[0:3] a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -114,13 +112,11 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16(i64 %node_ptr, float
 ; GFX10-LABEL: image_bvh64_intersect_ray_a16:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 16, v6
-; GFX10-NEXT:    v_and_b32_e32 v11, 0xffff, v8
-; GFX10-NEXT:    v_and_b32_e32 v9, 0xffff, v9
-; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
-; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v11
+; GFX10-NEXT:    v_lshlrev_b32_e32 v11, 16, v8
 ; GFX10-NEXT:    v_alignbit_b32 v8, v9, v8, 16
-; GFX10-NEXT:    v_and_or_b32 v6, 0xffff, v6, v10
+; GFX10-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
 ; GFX10-NEXT:    v_and_or_b32 v7, 0xffff, v7, v11
+; GFX10-NEXT:    v_and_or_b32 v6, 0xffff, v6, v10
 ; GFX10-NEXT:    image_bvh64_intersect_ray v[0:3], v[0:8], s[0:3] a16
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -254,19 +250,17 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1030-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    v_mov_b32_e32 v13, v0
-; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v5
-; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v7
+; GFX1030-NEXT:    v_mov_b32_e32 v14, v1
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v7
 ; GFX1030-NEXT:    v_mov_b32_e32 v15, v2
-; GFX1030-NEXT:    v_and_b32_e32 v2, 0xffff, v8
 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v3
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v4
-; GFX1030-NEXT:    v_alignbit_b32 v20, v2, v7, 16
+; GFX1030-NEXT:    v_and_or_b32 v19, 0xffff, v6, v1
+; GFX1030-NEXT:    v_alignbit_b32 v20, v8, v7, 16
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1030-NEXT:    v_and_or_b32 v18, 0xffff, v5, v0
-; GFX1030-NEXT:    v_and_or_b32 v19, 0xffff, v6, v1
 ; GFX1030-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX1030-NEXT:    v_readfirstlane_b32 s5, v10
@@ -297,14 +291,12 @@ define amdgpu_ps <4 x float> @image_bvh_intersect_ray_a16_vgpr_descr(i32 %node_p
 ; GFX1013-LABEL: image_bvh_intersect_ray_a16_vgpr_descr:
 ; GFX1013:       ; %bb.0:
 ; GFX1013-NEXT:    v_lshrrev_b32_e32 v13, 16, v5
-; GFX1013-NEXT:    v_and_b32_e32 v14, 0xffff, v7
-; GFX1013-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v7
+; GFX1013-NEXT:    v_alignbit_b32 v7, v8, v7, 16
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v13, 16, v13
-; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX1013-NEXT:    v_alignbit_b32 v7, v8, v7, 16
-; GFX1013-NEXT:    v_and_or_b32 v5, 0xffff, v5, v13
 ; GFX1013-NEXT:    v_and_or_b32 v6, 0xffff, v6, v14
+; GFX1013-NEXT:    v_and_or_b32 v5, 0xffff, v5, v13
 ; GFX1013-NEXT:  .LBB7_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v9
 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v10
@@ -485,19 +477,17 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1030-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1030:       ; %bb.0:
 ; GFX1030-NEXT:    v_mov_b32_e32 v14, v0
-; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
 ; GFX1030-NEXT:    v_lshrrev_b32_e32 v0, 16, v6
-; GFX1030-NEXT:    v_and_b32_e32 v1, 0xffff, v8
+; GFX1030-NEXT:    v_mov_b32_e32 v15, v1
+; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v8
 ; GFX1030-NEXT:    v_mov_b32_e32 v16, v2
-; GFX1030-NEXT:    v_and_b32_e32 v2, 0xffff, v9
 ; GFX1030-NEXT:    v_mov_b32_e32 v17, v3
 ; GFX1030-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
-; GFX1030-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX1030-NEXT:    v_mov_b32_e32 v18, v4
 ; GFX1030-NEXT:    v_mov_b32_e32 v19, v5
-; GFX1030-NEXT:    v_alignbit_b32 v22, v2, v8, 16
-; GFX1030-NEXT:    v_and_or_b32 v20, 0xffff, v6, v0
 ; GFX1030-NEXT:    v_and_or_b32 v21, 0xffff, v7, v1
+; GFX1030-NEXT:    v_alignbit_b32 v22, v9, v8, 16
+; GFX1030-NEXT:    v_and_or_b32 v20, 0xffff, v6, v0
 ; GFX1030-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1030-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1030-NEXT:    v_readfirstlane_b32 s4, v10
@@ -530,14 +520,12 @@ define amdgpu_ps <4 x float> @image_bvh64_intersect_ray_a16_vgpr_descr(i64 %node
 ; GFX1013-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr:
 ; GFX1013:       ; %bb.0:
 ; GFX1013-NEXT:    v_lshrrev_b32_e32 v14, 16, v6
-; GFX1013-NEXT:    v_and_b32_e32 v15, 0xffff, v8
-; GFX1013-NEXT:    v_and_b32_e32 v9, 0xffff, v9
+; GFX1013-NEXT:    v_lshlrev_b32_e32 v15, 16, v8
+; GFX1013-NEXT:    v_alignbit_b32 v8, v9, v8, 16
 ; GFX1013-NEXT:    s_mov_b32 s1, exec_lo
 ; GFX1013-NEXT:    v_lshlrev_b32_e32 v14, 16, v14
-; GFX1013-NEXT:    v_lshlrev_b32_e32 v15, 16, v15
-; GFX1013-NEXT:    v_alignbit_b32 v8, v9, v8, 16
-; GFX1013-NEXT:    v_and_or_b32 v6, 0xffff, v6, v14
 ; GFX1013-NEXT:    v_and_or_b32 v7, 0xffff, v7, v15
+; GFX1013-NEXT:    v_and_or_b32 v6, 0xffff, v6, v14
 ; GFX1013-NEXT:  .LBB9_1: ; =>This Inner Loop Header: Depth=1
 ; GFX1013-NEXT:    v_readfirstlane_b32 s4, v10
 ; GFX1013-NEXT:    v_readfirstlane_b32 s5, v11
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
index 36d5e914d40b..af695287991c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.buffer.load.format.f16.ll
@@ -86,11 +86,9 @@ define amdgpu_ps <2 x half> @raw_buffer_load_format_v2f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -157,19 +155,15 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -391,19 +385,15 @@ define amdgpu_ps <4 x half> @raw_buffer_load_format_v4f16__sgpr_rsrc__vgpr_voffs
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
index 5b19b1c913a9..29acd4277e7b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.buffer.load.format.f16.ll
@@ -70,11 +70,9 @@ define amdgpu_ps <2 x half> @raw_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgpr_v
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -126,19 +124,15 @@ define amdgpu_ps <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_v
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -291,19 +285,15 @@ define amdgpu_ps <4 x half> @raw_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgpr_v
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
index a799e203d643..54fe9c383e65 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.ptr.tbuffer.load.f16.ll
@@ -54,11 +54,9 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -107,19 +105,15 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
index 1cfb15391be3..4c8e2591b14c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.raw.tbuffer.load.f16.ll
@@ -70,11 +70,9 @@ define amdgpu_ps <2 x half> @raw_tbuffer_load_v2f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY8]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY9]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -138,19 +136,15 @@ define amdgpu_ps <4 x half> @raw_tbuffer_load_v4f16__sgpr_rsrc__vgpr_voffset__sg
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY6]], [[COPY10]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[COPY7]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY13]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY15]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[COPY9]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
index 06560afee3c9..ad7e5651c33e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sdot4.ll
@@ -46,16 +46,14 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_or_b32 v0, v0, v9, v1
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
 ; GFX906-NEXT:    v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v7
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX906-NEXT:    v_dot4_i32_i8 v0, v0, v1, v8
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
@@ -64,18 +62,16 @@ define i32 @v_sdot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 8
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX10-NEXT:    v_dot4c_i32_i8 v8, v0, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
index 870588014cd2..a68359a95056 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.f16.ll
@@ -78,11 +78,9 @@ define amdgpu_ps <2 x half> @struct_buffer_load_format_v2f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -152,19 +150,15 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__sgpr_rsrc__vgpr_vi
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -272,19 +266,15 @@ define amdgpu_ps <4 x half> @struct_buffer_load_format_v4f16__vpr_rsrc__sgpr_vin
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
index 6c0319ef570d..352d954c9ea3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.f16.ll
@@ -60,11 +60,9 @@ define amdgpu_ps <2 x half> @struct_ptr_buffer_load_format_v2f16__sgpr_rsrc__vgp
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -117,19 +115,15 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__sgpr_rsrc__vgp
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -217,19 +211,15 @@ define amdgpu_ps <4 x half> @struct_ptr_buffer_load_format_v4f16__vpr_rsrc__sgpr
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
index 1a57c2e77bdd..7ecb91e1a9fb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.tbuffer.load.f16.ll
@@ -79,11 +79,9 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -139,19 +137,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -315,19 +309,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
index f270f87aae66..b6850bb1b2f2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.tbuffer.load.f16.ll
@@ -114,11 +114,9 @@ define amdgpu_ps <2 x half> @struct_tbuffer_load_v2f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY9]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY10]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY11]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY10]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0
@@ -194,19 +192,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__sgpr_rsrc__vgpr_vindex__
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY11:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY7]], [[COPY11]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY8]], [[COPY12]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY13]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY12:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY12]], [[COPY8]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY14]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY15:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY10]], [[COPY15]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY16:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY16]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY13:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY9]], [[COPY13]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY14]], [[COPY10]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
@@ -445,19 +439,15 @@ define amdgpu_ps <4 x half> @struct_tbuffer_load_v4f16__vgpr_rsrc__sgpr_vindex__
   ; UNPACKED-NEXT:   [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
   ; UNPACKED-NEXT:   [[COPY17:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
   ; UNPACKED-NEXT:   [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY13]], [[COPY17]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY14]], [[COPY18]], implicit $exec
   ; UNPACKED-NEXT:   [[S_MOV_B32_1:%[0-9]+]]:sreg_32 = S_MOV_B32 16
-  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY19]], [[V_AND_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY18:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY18]], [[COPY14]], implicit $exec
   ; UNPACKED-NEXT:   [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_]], [[V_LSHLREV_B32_e64_]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_2:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY20]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY21:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
-  ; UNPACKED-NEXT:   [[V_AND_B32_e64_3:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY16]], [[COPY21]], implicit $exec
-  ; UNPACKED-NEXT:   [[COPY22:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
-  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY22]], [[V_AND_B32_e64_3]], implicit $exec
-  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_2]], [[V_LSHLREV_B32_e64_1]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY19:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_]]
+  ; UNPACKED-NEXT:   [[V_AND_B32_e64_1:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[COPY15]], [[COPY19]], implicit $exec
+  ; UNPACKED-NEXT:   [[COPY20:%[0-9]+]]:vgpr_32 = COPY [[S_MOV_B32_1]]
+  ; UNPACKED-NEXT:   [[V_LSHLREV_B32_e64_1:%[0-9]+]]:vgpr_32 = V_LSHLREV_B32_e64 [[COPY20]], [[COPY16]], implicit $exec
+  ; UNPACKED-NEXT:   [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 [[V_AND_B32_e64_1]], [[V_LSHLREV_B32_e64_1]], implicit $exec
   ; UNPACKED-NEXT:   $vgpr0 = COPY [[V_OR_B32_e64_]]
   ; UNPACKED-NEXT:   $vgpr1 = COPY [[V_OR_B32_e64_1]]
   ; UNPACKED-NEXT:   SI_RETURN_TO_EPILOG implicit $vgpr0, implicit $vgpr1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
index b14af9e043e0..0c061d0aa02d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.udot4.ll
@@ -46,16 +46,14 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_or_b32 v0, v0, v9, v1
 ; GFX906-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v3
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
+; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
 ; GFX906-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX906-NEXT:    v_lshlrev_b32_sdwa v1, v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX906-NEXT:    v_and_b32_e32 v2, 0xff, v6
-; GFX906-NEXT:    v_and_b32_e32 v3, 0xff, v7
 ; GFX906-NEXT:    v_and_or_b32 v1, v4, v9, v1
 ; GFX906-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
+; GFX906-NEXT:    v_lshlrev_b32_e32 v3, 24, v7
 ; GFX906-NEXT:    v_or3_b32 v1, v1, v2, v3
 ; GFX906-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
 ; GFX906-NEXT:    s_setpc_b64 s[30:31]
@@ -64,18 +62,16 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_mov_b32_e32 v9, 8
+; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v2
+; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX10-NEXT:    v_lshlrev_b32_sdwa v1, v9, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+; GFX10-NEXT:    v_lshlrev_b32_sdwa v5, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
 ; GFX10-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX10-NEXT:    v_and_b32_e32 v1, 0xff, v2
-; GFX10-NEXT:    v_and_b32_e32 v2, 0xff, v3
-; GFX10-NEXT:    v_lshlrev_b32_sdwa v3, v9, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-; GFX10-NEXT:    v_and_b32_e32 v5, 0xff, v6
-; GFX10-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v2
-; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v3
-; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v5
-; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX10-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX10-NEXT:    v_and_or_b32 v3, 0xff, v4, v5
+; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX10-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
 ; GFX10-NEXT:    v_or3_b32 v0, v0, v1, v2
 ; GFX10-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX10-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
@@ -87,19 +83,17 @@ define i32 @v_udot4_cast_v4i8(<4 x i8> %a, <4 x i8> %b, i32 %c) {
 ; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_and_b32_e32 v5, 0xff, v5
 ; GFX11-NEXT:    v_and_b32_e32 v2, 0xff, v2
-; GFX11-NEXT:    v_and_b32_e32 v3, 0xff, v3
+; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v6
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 8, v5
-; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX11-NEXT:    v_lshlrev_b32_e32 v3, 24, v3
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xff, v0, v1
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v6
-; GFX11-NEXT:    v_and_b32_e32 v6, 0xff, v7
-; GFX11-NEXT:    v_and_or_b32 v4, 0xff, v4, v5
-; GFX11-NEXT:    v_or3_b32 v0, v0, v2, v3
-; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
-; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v6
-; GFX11-NEXT:    v_or3_b32 v1, v4, v1, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v1, 16, v2
+; GFX11-NEXT:    v_lshlrev_b32_e32 v2, 24, v3
+; GFX11-NEXT:    v_and_or_b32 v3, 0xff, v4, v5
+; GFX11-NEXT:    v_lshlrev_b32_e32 v4, 16, v6
+; GFX11-NEXT:    v_lshlrev_b32_e32 v5, 24, v7
+; GFX11-NEXT:    v_or3_b32 v0, v0, v1, v2
+; GFX11-NEXT:    v_or3_b32 v1, v3, v4, v5
 ; GFX11-NEXT:    v_dot4_u32_u8 v0, v0, v1, v8
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %a.cast = bitcast <4 x i8> %a to i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
index e7119c89ac06..04cea6567439 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/orn2.ll
@@ -687,11 +687,9 @@ define <2 x i16> @v_orn2_v2i16(<2 x i16> %src0, <2 x i16> %src1) {
 define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v3i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -740,11 +738,9 @@ define amdgpu_ps i48 @s_orn2_v3i16(<3 x i16> inreg %src0, <3 x i16> inreg %src1)
 define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v3i16_commute:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
-; GFX6-NEXT:    s_and_b32 s3, s3, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s1, 0xffff
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
@@ -793,7 +789,6 @@ define amdgpu_ps i48 @s_orn2_v3i16_commute(<3 x i16> inreg %src0, <3 x i16> inre
 define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3 x i16> inreg %src1) {
 ; GFX6-LABEL: s_orn2_v3i16_multi_use:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_and_b32 s6, s6, 0xffff
 ; GFX6-NEXT:    s_mov_b32 s0, -1
 ; GFX6-NEXT:    s_and_b32 s5, s5, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, 16
@@ -801,9 +796,8 @@ define amdgpu_ps { i48, i48 } @s_orn2_v3i16_multi_use(<3 x i16> inreg %src0, <3
 ; GFX6-NEXT:    s_or_b32 s6, s5, s6
 ; GFX6-NEXT:    s_and_b32 s7, s7, 0xffff
 ; GFX6-NEXT:    s_xor_b64 s[6:7], s[6:7], s[0:1]
-; GFX6-NEXT:    s_and_b32 s1, s3, 0xffff
 ; GFX6-NEXT:    s_and_b32 s0, s2, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s4, 0xffff
 ; GFX6-NEXT:    s_or_b64 s[0:1], s[0:1], s[6:7]
@@ -865,10 +859,8 @@ define <3 x i16> @v_orn2_v3i16(<3 x i16> %src0, <3 x i16> %src1) {
 ; GFX6-LABEL: v_orn2_v3i16:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index a6f9bb7ee055..46dfe75fddc3 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -279,9 +279,10 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_max_i16_e32 v2, v4, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v2, v1
 ; GFX8-NEXT:    v_add_u16_e32 v1, v3, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -298,8 +299,8 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -315,9 +316,9 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_i16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -336,7 +337,6 @@ define i16 @v_saddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -411,11 +411,10 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s4
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_add_i32 s1, s1, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -438,8 +437,8 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_i16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -458,10 +457,10 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_add_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -484,7 +483,6 @@ define amdgpu_ps i16 @s_saddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -555,8 +553,7 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -608,10 +605,11 @@ define i32 @v_saddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -771,8 +769,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -856,8 +853,7 @@ define amdgpu_ps i32 @s_saddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2828,9 +2824,8 @@ define amdgpu_ps i32 @s_saddsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_max_i32 s2, s4, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s3
 ; GFX6-NEXT:    s_add_i32 s1, s1, s2
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -2909,9 +2904,8 @@ define amdgpu_ps float @saddsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2977,9 +2971,8 @@ define amdgpu_ps float @saddsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v3, s0, v3
 ; GFX6-NEXT:    v_min_i32_e32 v2, v3, v2
 ; GFX6-NEXT:    v_add_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3068,22 +3061,20 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_add_i32_e32 v2, vcc, v2, v4
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_max_i32_e32 v5, 0, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v11, v6
+; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v9, v5
 ; GFX6-NEXT:    v_max_i32_e32 v4, v6, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v5
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3179,19 +3170,17 @@ define amdgpu_ps <2 x i32> @s_saddsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_sub_i32 s6, 0x80000000, s6
 ; GFX6-NEXT:    s_sub_i32 s5, 0x7fffffff, s5
 ; GFX6-NEXT:    s_max_i32 s4, s6, s4
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s5
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_add_i32 s3, s3, s4
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -3349,27 +3338,24 @@ define <3 x float> @v_saddsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
 ; GFX6-NEXT:    v_max_i32_e32 v7, 0, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v15, v8
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v13, v7
 ; GFX6-NEXT:    v_max_i32_e32 v6, v8, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v7
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_add_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3499,27 +3485,24 @@ define amdgpu_ps <3 x i32> @s_saddsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX6-NEXT:    s_max_i32 s7, s5, 0
 ; GFX6-NEXT:    s_sub_i32 s8, 0x80000000, s8
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s7, 0x7fffffff, s7
 ; GFX6-NEXT:    s_max_i32 s6, s8, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s7
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_add_i32 s5, s5, s6
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -3715,37 +3698,33 @@ define <4 x float> @v_saddsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_min_i32_e32 v10, 0, v7
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_add_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_max_i32_e32 v9, 0, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v19, v10
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v17, v9
 ; GFX6-NEXT:    v_max_i32_e32 v8, v10, v8
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v9
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_add_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
+; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3907,37 +3886,33 @@ define amdgpu_ps <4 x i32> @s_saddsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_min_i32 s10, s7, 0
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_add_i32 s6, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
 ; GFX6-NEXT:    s_max_i32 s9, s7, 0
 ; GFX6-NEXT:    s_sub_i32 s10, 0x80000000, s10
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s9, 0x7fffffff, s9
 ; GFX6-NEXT:    s_max_i32 s8, s10, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s9
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_add_i32 s7, s7, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
index b666f4552166..e043823d28de 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll
@@ -2401,7 +2401,6 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2410,8 +2409,7 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_short v[0:1], v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v3
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
@@ -2481,25 +2479,21 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v1
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s5, v3
 ; GFX9-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX9-NEXT:    s_xor_b32 s4, s11, s9
-; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s5, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v1, s4, v1
+; GFX9-NEXT:    s_xor_b32 s4, s11, s9
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
-; GFX9-NEXT:    v_and_b32_e32 v1, 0xff, v1
+; GFX9-NEXT:    v_xor_b32_e32 v1, s4, v1
 ; GFX9-NEXT:    v_subrev_u32_e32 v0, s6, v0
+; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
+; GFX9-NEXT:    v_sub_u32_sdwa v1, v1, s4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_xor_b32_e32 v3, s11, v3
-; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
-; GFX9-NEXT:    v_subrev_u32_e32 v3, s11, v3
+; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
+; GFX9-NEXT:    v_sub_u32_sdwa v3, v3, s11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
-; GFX9-NEXT:    v_xor_b32_e32 v2, s10, v2
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v3
-; GFX9-NEXT:    v_subrev_u32_e32 v2, s10, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
-; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2507,17 +2501,18 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_load_dword s0, s[4:5], 0x10
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80018
-; GFX10-NEXT:    s_bfe_i32 s3, s0, 0x80010
-; GFX10-NEXT:    s_ashr_i32 s2, s1, 31
-; GFX10-NEXT:    s_ashr_i32 s8, s3, 31
-; GFX10-NEXT:    s_add_i32 s1, s1, s2
-; GFX10-NEXT:    s_add_i32 s3, s3, s8
-; GFX10-NEXT:    s_xor_b32 s1, s1, s2
-; GFX10-NEXT:    s_xor_b32 s3, s3, s8
+; GFX10-NEXT:    s_bfe_i32 s1, s0, 0x80010
+; GFX10-NEXT:    s_bfe_i32 s2, s0, 0x80018
+; GFX10-NEXT:    s_ashr_i32 s3, s1, 31
+; GFX10-NEXT:    s_ashr_i32 s8, s2, 31
+; GFX10-NEXT:    s_add_i32 s1, s1, s3
+; GFX10-NEXT:    s_add_i32 s2, s2, s8
+; GFX10-NEXT:    s_xor_b32 s1, s1, s3
+; GFX10-NEXT:    s_xor_b32 s2, s2, s8
 ; GFX10-NEXT:    v_cvt_f32_u32_e32 v0, s1
-; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s3
+; GFX10-NEXT:    v_cvt_f32_u32_e32 v1, s2
 ; GFX10-NEXT:    s_sub_i32 s6, 0, s1
+; GFX10-NEXT:    s_sub_i32 s7, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
@@ -2525,15 +2520,14 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s6, v0
-; GFX10-NEXT:    s_sub_i32 s6, 0, s3
-; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX10-NEXT:    s_bfe_i32 s6, s0, 0x80008
-; GFX10-NEXT:    s_sext_i32_i8 s0, s0
+; GFX10-NEXT:    v_mul_lo_u32 v3, s7, v1
+; GFX10-NEXT:    s_sext_i32_i8 s6, s0
+; GFX10-NEXT:    s_bfe_i32 s0, s0, 0x80008
 ; GFX10-NEXT:    s_ashr_i32 s9, s6, 31
 ; GFX10-NEXT:    s_ashr_i32 s10, s0, 31
-; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    s_add_i32 s6, s6, s9
 ; GFX10-NEXT:    s_add_i32 s0, s0, s10
+; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    s_xor_b32 s6, s6, s9
 ; GFX10-NEXT:    s_xor_b32 s0, s0, s10
@@ -2542,46 +2536,43 @@ define amdgpu_kernel void @sdivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_mul_hi_u32 v0, s6, v0
 ; GFX10-NEXT:    v_mul_hi_u32 v1, s0, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s1
+; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s3
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s6, v2
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v3
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
+; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
-; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s1, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s1, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
-; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s3, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s3, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
-; GFX10-NEXT:    s_xor_b32 s1, s9, s2
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
-; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
-; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
-; GFX10-NEXT:    v_xor_b32_e32 v2, s9, v2
+; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v2
+; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s1, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s2, v3
+; GFX10-NEXT:    s_xor_b32 s1, s9, s3
+; GFX10-NEXT:    v_cndmask_b32_e64 v0, v0, v4, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, v1, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v2, v2, v6, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, v3, v7, vcc_lo
 ; GFX10-NEXT:    s_xor_b32 s0, s10, s8
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX10-NEXT:    v_xor_b32_e32 v0, s1, v0
 ; GFX10-NEXT:    v_xor_b32_e32 v1, s0, v1
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
+; GFX10-NEXT:    v_xor_b32_e32 v2, s9, v2
 ; GFX10-NEXT:    v_xor_b32_e32 v3, s10, v3
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v0, s1, v0
+; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v2, s9, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v1, s0, v1
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v3, s10, v3
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v3, s10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-NEXT:    global_store_short v1, v0, s[4:5]
 ; GFX10-NEXT:    global_store_short v1, v2, s[6:7]
@@ -2807,16 +2798,14 @@ define amdgpu_kernel void @sdivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s11, v3
 ; GFX8-NEXT:    s_xor_b32 s0, s2, s10
-; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
+; GFX8-NEXT:    v_xor_b32_e32 v1, s0, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v1, vcc, s0, v1
 ; GFX8-NEXT:    v_xor_b32_e32 v3, s2, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_subrev_u32_e32 v3, vcc, s2, v3
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
index bac80f0777c0..56561d163bb9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sext_inreg.ll
@@ -709,9 +709,8 @@ define <2 x i16> @v_sext_inreg_v2i16_15(<2 x i16> %value) {
 define amdgpu_ps i32 @s_sext_inreg_v2i16_11(<2 x i16> inreg %value) {
 ; GFX6-LABEL: s_sext_inreg_v2i16_11:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_bfe_i32 s1, s1, 0x50000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x50000
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_bfe_i32 s1, s1, 0x50000
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -781,17 +780,15 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v4i16_3:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 13
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 13
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 13
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 13
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 13
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -837,17 +834,15 @@ define <2 x float> @v_sext_inreg_v4i16_3(<4 x i16> %value) {
 define amdgpu_ps <2 x i32> @s_sext_inreg_v4i16_14(<4 x i16> inreg %value) {
 ; GFX6-LABEL: s_sext_inreg_v4i16_14:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_bfe_i32 s1, s1, 0x20000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0x20000
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_bfe_i32 s1, s1, 0x20000
 ; GFX6-NEXT:    s_bfe_i32 s2, s2, 0x20000
 ; GFX6-NEXT:    s_bfe_i32 s3, s3, 0x20000
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -953,29 +948,25 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 ; GFX6-LABEL: v_sext_inreg_v8i16_11:
 ; GFX6:       ; %bb.0:
 ; GFX6-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 5
 ; GFX6-NEXT:    v_bfe_i32 v0, v0, 0, 5
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_bfe_i32 v1, v1, 0, 5
 ; GFX6-NEXT:    v_bfe_i32 v2, v2, 0, 5
 ; GFX6-NEXT:    v_bfe_i32 v3, v3, 0, 5
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 5
 ; GFX6-NEXT:    v_bfe_i32 v5, v5, 0, 5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_bfe_i32 v4, v4, 0, 5
-; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 5
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_bfe_i32 v6, v6, 0, 5
+; GFX6-NEXT:    v_bfe_i32 v7, v7, 0, 5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1039,29 +1030,25 @@ define <4 x float> @v_sext_inreg_v8i16_11(<8 x i16> %value) {
 define amdgpu_ps <4 x i32> @s_sext_inreg_v8i16_5(<8 x i16> inreg %value) {
 ; GFX6-LABEL: s_sext_inreg_v8i16_5:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_bfe_i32 s1, s1, 0xb0000
 ; GFX6-NEXT:    s_bfe_i32 s0, s0, 0xb0000
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_bfe_i32 s1, s1, 0xb0000
 ; GFX6-NEXT:    s_bfe_i32 s2, s2, 0xb0000
 ; GFX6-NEXT:    s_bfe_i32 s3, s3, 0xb0000
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_bfe_i32 s4, s4, 0xb0000
 ; GFX6-NEXT:    s_bfe_i32 s5, s5, 0xb0000
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_bfe_i32 s4, s4, 0xb0000
-; GFX6-NEXT:    s_bfe_i32 s7, s7, 0xb0000
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_bfe_i32 s6, s6, 0xb0000
+; GFX6-NEXT:    s_bfe_i32 s7, s7, 0xb0000
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
index 7ad19a479700..d48fda055059 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll
@@ -10,24 +10,15 @@
 define amdgpu_ps i64 @s_shl_i64_zext_i32(i32 inreg %x) {
 ; GCN-LABEL: s_shl_i64_zext_i32:
 ; GCN:       ; %bb.0:
-; GCN-NEXT:    s_andn2_b32 s0, s0, -2.0
 ; GCN-NEXT:    s_lshl_b32 s0, s0, 2
 ; GCN-NEXT:    s_mov_b32 s1, 0
 ; GCN-NEXT:    ; return to shader part epilog
 ;
-; GFX10-LABEL: s_shl_i64_zext_i32:
-; GFX10:       ; %bb.0:
-; GFX10-NEXT:    s_andn2_b32 s0, s0, -2.0
-; GFX10-NEXT:    s_mov_b32 s1, 0
-; GFX10-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX10-NEXT:    ; return to shader part epilog
-;
-; GFX11-LABEL: s_shl_i64_zext_i32:
-; GFX11:       ; %bb.0:
-; GFX11-NEXT:    s_and_not1_b32 s0, s0, -2.0
-; GFX11-NEXT:    s_mov_b32 s1, 0
-; GFX11-NEXT:    s_lshl_b32 s0, s0, 2
-; GFX11-NEXT:    ; return to shader part epilog
+; GFX10PLUS-LABEL: s_shl_i64_zext_i32:
+; GFX10PLUS:       ; %bb.0:
+; GFX10PLUS-NEXT:    s_lshl_b32 s0, s0, 2
+; GFX10PLUS-NEXT:    s_mov_b32 s1, 0
+; GFX10PLUS-NEXT:    ; return to shader part epilog
   %and = and i32 %x, 1073741823
   %ext = zext i32 %and to i64
   %shl = shl i64 %ext, 2
@@ -38,7 +29,6 @@ define i64 @v_shl_i64_zext_i32(i32 %x) {
 ; GCN-LABEL: v_shl_i64_zext_i32:
 ; GCN:       ; %bb.0:
 ; GCN-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GCN-NEXT:    v_mov_b32_e32 v1, 0
 ; GCN-NEXT:    s_setpc_b64 s[30:31]
@@ -46,16 +36,14 @@ define i64 @v_shl_i64_zext_i32(i32 %x) {
 ; GFX10-LABEL: v_shl_i64_zext_i32:
 ; GFX10:       ; %bb.0:
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10-NEXT:    v_and_b32_e32 v0, 0x3fffffff, v0
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: v_shl_i64_zext_i32:
 ; GFX11:       ; %bb.0:
 ; GFX11-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3fffffff, v0
-; GFX11-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX11-NEXT:    v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %and = and i32 %x, 1073741823
   %ext = zext i32 %and to i64
@@ -553,7 +541,6 @@ define <2 x i64> @v_shl_v2i64_sext_v2i32(<2 x i32> %x) {
 define amdgpu_ps i32 @s_shl_i32_zext_i16(i16 inreg %x) {
 ; GFX7-LABEL: s_shl_i32_zext_i16:
 ; GFX7:       ; %bb.0:
-; GFX7-NEXT:    s_and_b32 s0, s0, 0x3fff
 ; GFX7-NEXT:    s_lshl_b32 s0, s0, 2
 ; GFX7-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX7-NEXT:    ; return to shader part epilog
@@ -585,7 +572,6 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
 ; GFX7-LABEL: v_shl_i32_zext_i16:
 ; GFX7:       ; %bb.0:
 ; GFX7-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX7-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
 ; GFX7-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX7-NEXT:    s_setpc_b64 s[30:31]
@@ -593,21 +579,18 @@ define i32 @v_shl_i32_zext_i16(i16 %x) {
 ; GFX8-LABEL: v_shl_i32_zext_i16:
 ; GFX8:       ; %bb.0:
 ; GFX8-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX9-LABEL: v_shl_i32_zext_i16:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 2, v0
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
 ; GFX10PLUS-LABEL: v_shl_i32_zext_i16:
 ; GFX10PLUS:       ; %bb.0:
 ; GFX10PLUS-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0x3fff, v0
 ; GFX10PLUS-NEXT:    v_lshlrev_b16 v0, 2, v0
 ; GFX10PLUS-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX10PLUS-NEXT:    s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
index c2f911cc4458..ff43ffc0382e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl.ll
@@ -780,9 +780,8 @@ define <2 x i16> @v_shl_v2i16_15(<2 x i16> %value) {
 define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amount) {
 ; GFX6-LABEL: s_shl_v2i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_lshl_b32 s1, s1, s3
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s2
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s3
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -825,11 +824,10 @@ define amdgpu_ps i32 @s_shl_v2i16(<2 x i16> inreg %value, <2 x i16> inreg %amoun
 define amdgpu_ps float @shl_v2i16_sv(<2 x i16> inreg %value, <2 x i16> %amount) {
 ; GFX6-LABEL: shl_v2i16_sv:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
-; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_lshl_b32_e32 v0, s0, v0
+; GFX6-NEXT:    v_lshl_b32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -865,7 +863,6 @@ define amdgpu_ps float @shl_v2i16_vs(<2 x i16> %value, <2 x i16> inreg %amount)
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v0, s0, v0
 ; GFX6-NEXT:    s_and_b32 s0, s1, 0xffff
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, s0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -916,14 +913,12 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v6
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v2, v4, v2
 ; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v3, v4, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -959,17 +954,15 @@ define <2 x float> @v_shl_v4i16(<4 x i16> %value, <4 x i16> %amount) {
 define amdgpu_ps <2 x i32> @s_shl_v4i16(<4 x i16> inreg %value, <4 x i16> inreg %amount) {
 ; GFX6-LABEL: s_shl_v4i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s4
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s5
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s6
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s7
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -1063,7 +1056,6 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v12
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, v8, v4
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v13
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v5, v8, v5
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v14
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1072,17 +1064,14 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 ; GFX6-NEXT:    v_and_b32_e32 v8, 0xffff, v15
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, v8, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -1128,29 +1117,25 @@ define <4 x float> @v_shl_v8i16(<8 x i16> %value, <8 x i16> %amount) {
 define amdgpu_ps <4 x i32> @s_shl_v8i16(<8 x i16> inreg %value, <8 x i16> inreg %amount) {
 ; GFX6-LABEL: s_shl_v8i16:
 ; GFX6:       ; %bb.0:
-; GFX6-NEXT:    s_lshl_b32 s1, s1, s9
 ; GFX6-NEXT:    s_lshl_b32 s0, s0, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s1, s1, s9
 ; GFX6-NEXT:    s_lshl_b32 s2, s2, s10
 ; GFX6-NEXT:    s_lshl_b32 s3, s3, s11
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_lshl_b32 s4, s4, s12
 ; GFX6-NEXT:    s_lshl_b32 s5, s5, s13
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s4, s4, s12
-; GFX6-NEXT:    s_lshl_b32 s7, s7, s15
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_lshl_b32 s6, s6, s14
+; GFX6-NEXT:    s_lshl_b32 s7, s7, s15
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 320dfbb4980e..f6ee4b5d022f 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -279,9 +279,10 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX8-NEXT:    v_max_i16_e32 v1, v1, v2
 ; GFX8-NEXT:    v_min_i16_e32 v1, v1, v4
 ; GFX8-NEXT:    v_sub_u16_e32 v1, v3, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_mov_b32_e32 v2, 0xff
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -298,8 +299,8 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -315,9 +316,9 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_i16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -336,7 +337,6 @@ define i16 @v_ssubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -411,11 +411,10 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_sext_i32_i16 s3, s3
 ; GFX8-NEXT:    s_min_i32 s2, s2, s3
 ; GFX8-NEXT:    s_sub_i32 s1, s1, s2
-; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_sext_i32_i16 s0, s0
-; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
+; GFX8-NEXT:    s_sext_i32_i16 s1, s1
 ; GFX8-NEXT:    s_ashr_i32 s0, s0, 8
-; GFX8-NEXT:    s_and_b32 s1, s1, 0xff
+; GFX8-NEXT:    s_ashr_i32 s1, s1, 8
 ; GFX8-NEXT:    s_and_b32 s0, s0, 0xff
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 8
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
@@ -438,8 +437,8 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_i16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -458,10 +457,10 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_sub_i16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -484,7 +483,6 @@ define amdgpu_ps i16 @s_ssubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_pk_ashrrev_i16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -555,8 +553,7 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 24, v3
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v1
+; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -608,10 +605,11 @@ define i32 @v_ssubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v1), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v0, sext(v0), v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_ashrrev_i16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v2), v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, sext(v3), v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -771,8 +769,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 24
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
-; GFX6-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX6-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX6-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -856,8 +853,7 @@ define amdgpu_ps i32 @s_ssubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    s_ashr_i32 s3, s3, 8
 ; GFX8-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
-; GFX8-NEXT:    s_and_b32 s1, s3, 0xff
-; GFX8-NEXT:    s_lshl_b32 s1, s1, 24
+; GFX8-NEXT:    s_lshl_b32 s1, s3, 24
 ; GFX8-NEXT:    s_or_b32 s0, s0, s1
 ; GFX8-NEXT:    ; return to shader part epilog
 ;
@@ -2828,9 +2824,8 @@ define amdgpu_ps i32 @s_ssubsat_v2i16(<2 x i16> inreg %lhs, <2 x i16> inreg %rhs
 ; GFX6-NEXT:    s_max_i32 s2, s3, s2
 ; GFX6-NEXT:    s_min_i32 s2, s2, s4
 ; GFX6-NEXT:    s_sub_i32 s1, s1, s2
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
@@ -2909,9 +2904,8 @@ define amdgpu_ps float @ssubsat_v2i16_sv(<2 x i16> inreg %lhs, <2 x i16> %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v1, s1, v1
 ; GFX6-NEXT:    v_min_i32_e32 v1, s2, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -2977,9 +2971,8 @@ define amdgpu_ps float @ssubsat_v2i16_vs(<2 x i16> %lhs, <2 x i16> inreg %rhs) {
 ; GFX6-NEXT:    v_max_i32_e32 v2, s0, v2
 ; GFX6-NEXT:    v_min_i32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v1, vcc, v1, v2
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
@@ -3069,21 +3062,19 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v9
 ; GFX6-NEXT:    v_min_i32_e32 v6, -1, v3
-; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v11
+; GFX6-NEXT:    v_subrev_i32_e32 v6, vcc, 0x80000000, v6
 ; GFX6-NEXT:    v_max_i32_e32 v4, v5, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v4, v4, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v3, vcc, v3, v4
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3179,19 +3170,17 @@ define amdgpu_ps <2 x i32> @s_ssubsat_v4i16(<4 x i16> inreg %lhs, <4 x i16> inre
 ; GFX6-NEXT:    s_min_i32 s6, s3, -1
 ; GFX6-NEXT:    s_sub_i32 s6, s6, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s4, s5, s4
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_min_i32 s4, s4, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s3, s3, s4
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -3349,27 +3338,24 @@ define <3 x float> @v_ssubsat_v6i16(<6 x i16> %lhs, <6 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v13
 ; GFX6-NEXT:    v_min_i32_e32 v8, -1, v5
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v8, vcc, v8, v15
 ; GFX6-NEXT:    v_max_i32_e32 v6, v7, v6
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_min_i32_e32 v6, v6, v8
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_sub_i32_e32 v5, vcc, v5, v6
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3499,27 +3485,24 @@ define amdgpu_ps <3 x i32> @s_ssubsat_v6i16(<6 x i16> inreg %lhs, <6 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s6, s11, 16
 ; GFX6-NEXT:    s_sub_i32 s7, s7, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s8, s5, -1
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s8, s8, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s6, s7, s6
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_min_i32 s6, s6, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_sub_i32 s5, s5, s6
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
@@ -3715,37 +3698,33 @@ define <4 x float> @v_ssubsat_v8i16(<8 x i16> %lhs, <8 x i16> %rhs) {
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_max_i32_e32 v9, -1, v7
-; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v6, vcc, v6, v8
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v8, 16, v15
 ; GFX6-NEXT:    v_sub_i32_e32 v9, vcc, v9, v17
 ; GFX6-NEXT:    v_min_i32_e32 v10, -1, v7
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v0, 16, v0
+; GFX6-NEXT:    v_ashrrev_i32_e32 v1, 16, v1
 ; GFX6-NEXT:    v_sub_i32_e32 v10, vcc, v10, v19
 ; GFX6-NEXT:    v_max_i32_e32 v8, v9, v8
-; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v2, 16, v2
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v3, 16, v3
 ; GFX6-NEXT:    v_min_i32_e32 v8, v8, v10
 ; GFX6-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX6-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v5, 16, v5
 ; GFX6-NEXT:    v_sub_i32_e32 v7, vcc, v7, v8
 ; GFX6-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX6-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX6-NEXT:    v_ashrrev_i32_e32 v4, 16, v4
-; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
-; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v5
+; GFX6-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX6-NEXT:    v_ashrrev_i32_e32 v6, 16, v6
+; GFX6-NEXT:    v_ashrrev_i32_e32 v7, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX6-NEXT:    v_and_b32_e32 v2, 0xffff, v4
-; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX6-NEXT:    v_and_b32_e32 v4, 0xffff, v7
+; GFX6-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
 ; GFX6-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX6-NEXT:    v_and_b32_e32 v3, 0xffff, v6
-; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX6-NEXT:    v_lshlrev_b32_e32 v4, 16, v7
 ; GFX6-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX6-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -3907,37 +3886,33 @@ define amdgpu_ps <4 x i32> @s_ssubsat_v8i16(<8 x i16> inreg %lhs, <8 x i16> inre
 ; GFX6-NEXT:    s_lshl_b32 s7, s7, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_max_i32 s9, s7, -1
-; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s6, s6, s8
 ; GFX6-NEXT:    s_lshl_b32 s8, s15, 16
 ; GFX6-NEXT:    s_sub_i32 s9, s9, 0x7fffffff
 ; GFX6-NEXT:    s_min_i32 s10, s7, -1
 ; GFX6-NEXT:    s_ashr_i32 s0, s0, 16
+; GFX6-NEXT:    s_ashr_i32 s1, s1, 16
 ; GFX6-NEXT:    s_sub_i32 s10, s10, 0x80000000
 ; GFX6-NEXT:    s_max_i32 s8, s9, s8
-; GFX6-NEXT:    s_and_b32 s1, s1, 0xffff
 ; GFX6-NEXT:    s_ashr_i32 s2, s2, 16
 ; GFX6-NEXT:    s_ashr_i32 s3, s3, 16
 ; GFX6-NEXT:    s_min_i32 s8, s8, s10
 ; GFX6-NEXT:    s_and_b32 s0, s0, 0xffff
 ; GFX6-NEXT:    s_lshl_b32 s1, s1, 16
+; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
 ; GFX6-NEXT:    s_ashr_i32 s5, s5, 16
 ; GFX6-NEXT:    s_sub_i32 s7, s7, s8
 ; GFX6-NEXT:    s_or_b32 s0, s0, s1
 ; GFX6-NEXT:    s_and_b32 s1, s2, 0xffff
-; GFX6-NEXT:    s_and_b32 s2, s3, 0xffff
-; GFX6-NEXT:    s_ashr_i32 s4, s4, 16
-; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
-; GFX6-NEXT:    s_lshl_b32 s2, s2, 16
-; GFX6-NEXT:    s_and_b32 s3, s5, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s2, s3, 16
 ; GFX6-NEXT:    s_ashr_i32 s6, s6, 16
+; GFX6-NEXT:    s_ashr_i32 s7, s7, 16
 ; GFX6-NEXT:    s_or_b32 s1, s1, s2
 ; GFX6-NEXT:    s_and_b32 s2, s4, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s3, s3, 16
-; GFX6-NEXT:    s_and_b32 s4, s7, 0xffff
+; GFX6-NEXT:    s_lshl_b32 s3, s5, 16
 ; GFX6-NEXT:    s_or_b32 s2, s2, s3
 ; GFX6-NEXT:    s_and_b32 s3, s6, 0xffff
-; GFX6-NEXT:    s_lshl_b32 s4, s4, 16
+; GFX6-NEXT:    s_lshl_b32 s4, s7, 16
 ; GFX6-NEXT:    s_or_b32 s3, s3, s4
 ; GFX6-NEXT:    ; return to shader part epilog
 ;
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
index 1821d29d4b05..02c671a3ccbf 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll
@@ -225,8 +225,8 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -242,9 +242,9 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_add_u16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -263,7 +263,6 @@ define i16 @v_uaddsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -330,8 +329,8 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_add_u16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -350,10 +349,10 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_add_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -376,7 +375,6 @@ define amdgpu_ps i16 @s_uaddsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -452,10 +450,11 @@ define i32 @v_uaddsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -615,23 +614,24 @@ define amdgpu_ps i32 @s_uaddsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    v_add_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_add_u16_e64 v1, s0, v1 clamp
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, 8
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    v_add_u16_e64 v1, s0, v1 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s3, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, 8
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    v_add_u16_e64 v2, s0, v2 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s4, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_add_u16_e64 v3, s0, v3 clamp
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-NEXT:    v_add_u16_e64 v3, s0, v3 clamp
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
index a58397eccaba..4014437a2754 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll
@@ -1931,7 +1931,6 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_add_u32_e32 v4, vcc, 1, v1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
 ; GFX8-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
@@ -1939,8 +1938,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    flat_store_short v[0:1], v4
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xff, v3
-; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX8-NEXT:    v_lshlrev_b16_e32 v0, 8, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s6
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s7
@@ -1996,7 +1994,6 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_add_u32_e32 v4, 1, v0
 ; GFX9-NEXT:    v_cmp_le_u32_e32 vcc, s7, v2
 ; GFX9-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v0
 ; GFX9-NEXT:    v_subrev_u32_e32 v4, s7, v2
 ; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
 ; GFX9-NEXT:    v_cndmask_b32_e32 v2, v2, v4, vcc
@@ -2004,8 +2001,7 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX9-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_store_short v1, v0, s[0:1]
-; GFX9-NEXT:    v_and_b32_e32 v0, 0xff, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v0, 8, v2
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    global_store_short v1, v0, s[2:3]
 ; GFX9-NEXT:    s_endpgm
@@ -2020,17 +2016,17 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, s1
 ; GFX10-NEXT:    s_sub_i32 s3, 0, s2
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v0, v0
-; GFX10-NEXT:    s_sub_i32 s6, 0, s1
+; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_f32_e32 v0, 0x4f7ffffe, v0
 ; GFX10-NEXT:    v_mul_f32_e32 v1, 0x4f7ffffe, v1
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v0, v0
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v1, v1
 ; GFX10-NEXT:    v_mul_lo_u32 v2, s3, v0
+; GFX10-NEXT:    s_sub_i32 s3, 0, s1
+; GFX10-NEXT:    v_mul_lo_u32 v3, s3, v1
 ; GFX10-NEXT:    s_bfe_u32 s3, s0, 0x80008
 ; GFX10-NEXT:    s_and_b32 s0, s0, 0xff
-; GFX10-NEXT:    v_mul_lo_u32 v3, s6, v1
-; GFX10-NEXT:    s_load_dwordx4 s[4:7], s[4:5], 0x0
 ; GFX10-NEXT:    v_mul_hi_u32 v2, v0, v2
 ; GFX10-NEXT:    v_mul_hi_u32 v3, v1, v3
 ; GFX10-NEXT:    v_add_nc_u32_e32 v0, v0, v2
@@ -2040,30 +2036,29 @@ define amdgpu_kernel void @udivrem_v2i8(ptr addrspace(1) %out0, ptr addrspace(1)
 ; GFX10-NEXT:    v_mul_lo_u32 v2, v0, s2
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_mul_lo_u32 v3, v1, s1
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v2, s3, v2
 ; GFX10-NEXT:    v_sub_nc_u32_e32 v3, s0, v3
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
 ; GFX10-NEXT:    v_add_nc_u32_e32 v4, 1, v0
 ; GFX10-NEXT:    v_cmp_le_u32_e32 vcc_lo, s2, v2
-; GFX10-NEXT:    v_subrev_nc_u32_e32 v6, s2, v2
-; GFX10-NEXT:    v_add_nc_u32_e32 v5, 1, v1
+; GFX10-NEXT:    v_subrev_nc_u32_e32 v5, s2, v2
+; GFX10-NEXT:    v_add_nc_u32_e32 v6, 1, v1
 ; GFX10-NEXT:    v_cmp_le_u32_e64 s0, s1, v3
 ; GFX10-NEXT:    v_subrev_nc_u32_e32 v7, s1, v3
 ; GFX10-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
-; GFX10-NEXT:    v_mov_b32_e32 v4, 0xff
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v6, vcc_lo
-; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v5, s0
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, v2, v5, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, v1, v6, s0
 ; GFX10-NEXT:    v_cndmask_b32_e64 v3, v3, v7, s0
-; GFX10-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_and_b32_sdwa v2, v2, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-NEXT:    v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
@@ -2241,12 +2236,10 @@ define amdgpu_kernel void @udivrem_v2i16(ptr addrspace(1) %out0, ptr addrspace(1
 ; GFX8-NEXT:    v_cmp_le_u32_e32 vcc, s3, v3
 ; GFX8-NEXT:    v_cndmask_b32_e32 v1, v1, v4, vcc
 ; GFX8-NEXT:    v_subrev_u32_e64 v4, s[0:1], s3, v3
-; GFX8-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX8-NEXT:    v_cndmask_b32_e32 v3, v3, v4, vcc
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX8-NEXT:    v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_and_b32_e32 v0, 0xffff, v3
-; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX8-NEXT:    v_lshlrev_b32_e32 v0, 16, v3
 ; GFX8-NEXT:    v_or_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s4
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
index a60370cd460f..43547d7f3a76 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll
@@ -219,8 +219,8 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX9-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX9-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -236,9 +236,9 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_lshlrev_b16 v1, 8, v1 op_sel_hi:[0,1]
 ; GFX10-NEXT:    v_pk_sub_u16 v0, v0, v1 clamp
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -257,7 +257,6 @@ define i16 @v_usubsat_v2i8(i16 %lhs.arg, i16 %rhs.arg) {
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
@@ -322,8 +321,8 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX9-NEXT:    v_mov_b32_e32 v0, s1
 ; GFX9-NEXT:    v_pk_sub_u16 v0, s0, v0 clamp
 ; GFX9-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX9-NEXT:    v_mov_b32_e32 v1, 0xff
-; GFX9-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
+; GFX9-NEXT:    v_lshlrev_b16_e32 v1, 8, v1
 ; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX9-NEXT:    ; return to shader part epilog
@@ -342,10 +341,10 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX10-NEXT:    s_lshl_b32 s3, s3, 8
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s0, s0, s2
 ; GFX10-NEXT:    s_pack_ll_b32_b16 s1, s1, s3
-; GFX10-NEXT:    v_mov_b32_e32 v1, 0xff
+; GFX10-NEXT:    v_mov_b32_e32 v1, 16
 ; GFX10-NEXT:    v_pk_sub_u16 v0, s0, s1 clamp
 ; GFX10-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
-; GFX10-NEXT:    v_and_b32_sdwa v1, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-NEXT:    v_lshrrev_b32_sdwa v1, v1, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX10-NEXT:    ; return to shader part epilog
@@ -368,7 +367,6 @@ define amdgpu_ps i16 @s_usubsat_v2i8(i16 inreg %lhs.arg, i16 inreg %rhs.arg) {
 ; GFX11-NEXT:    v_pk_lshrrev_b16 v0, 8, v0 op_sel_hi:[0,1]
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX11-NEXT:    v_and_b32_e32 v0, 0xff, v0
-; GFX11-NEXT:    v_and_b32_e32 v1, 0xff, v1
 ; GFX11-NEXT:    v_lshlrev_b16 v1, 8, v1
 ; GFX11-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX11-NEXT:    v_readfirstlane_b32 s0, v0
@@ -440,10 +438,11 @@ define i32 @v_usubsat_v4i8(i32 %lhs.arg, i32 %rhs.arg) {
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    s_setpc_b64 s[30:31]
 ;
@@ -599,23 +598,24 @@ define amdgpu_ps i32 @s_usubsat_v4i8(i32 inreg %lhs.arg, i32 inreg %rhs.arg) {
 ; GFX8-NEXT:    v_sub_u16_e64 v0, s0, v0 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s2, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
-; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
 ; GFX8-NEXT:    s_lshl_b32 s1, s6, 8
-; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
+; GFX8-NEXT:    v_sub_u16_e64 v1, s0, v1 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s3, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v2, s1
 ; GFX8-NEXT:    s_lshl_b32 s1, s7, 8
-; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_mov_b32_e32 v4, 0xff
 ; GFX8-NEXT:    v_sub_u16_e64 v2, s0, v2 clamp
 ; GFX8-NEXT:    s_lshl_b32 s0, s4, 8
 ; GFX8-NEXT:    v_mov_b32_e32 v3, s1
+; GFX8-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_sub_u16_e64 v3, s0, v3 clamp
 ; GFX8-NEXT:    v_and_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 8, v1
-; GFX8-NEXT:    v_sub_u16_e64 v3, s0, v3 clamp
+; GFX8-NEXT:    v_lshrrev_b16_e32 v3, 8, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_and_b32_sdwa v1, v2, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
-; GFX8-NEXT:    v_and_b32_sdwa v1, v3, v4 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v1, 24, v3
 ; GFX8-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX8-NEXT:    v_readfirstlane_b32 s0, v0
 ; GFX8-NEXT:    ; return to shader part epilog
diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
index d9cbbc11f9a7..305108330d19 100644
--- a/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
+++ b/llvm/test/CodeGen/AMDGPU/global-saddr-load.ll
@@ -4420,9 +4420,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi(ptr addrs
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3]
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
@@ -4460,9 +4459,8 @@ define amdgpu_ps <2 x half> @global_load_saddr_i16_d16hi_sexti8_reg_hi_immneg128
 ; GFX12-GISEL:       ; %bb.0:
 ; GFX12-GISEL-NEXT:    global_load_i8 v0, v0, s[2:3] offset:-128
 ; GFX12-GISEL-NEXT:    s_wait_loadcnt 0x0
-; GFX12-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_lshlrev_b32_e32 v0, 16, v0
+; GFX12-GISEL-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX12-GISEL-NEXT:    v_and_or_b32 v0, 0xffff, v1, v0
 ; GFX12-GISEL-NEXT:    ; return to shader part epilog
   %zext.offset = zext i32 %voffset to i64
diff --git a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
index 526ee5a51745..942c61f33943 100644
--- a/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
+++ b/llvm/test/CodeGen/AMDGPU/integer-mad-patterns.ll
@@ -382,39 +382,36 @@ define <2 x i16> @clpeak_imad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -917,74 +914,67 @@ define <4 x i16> @clpeak_imad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v4i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v8, v8, v4
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v9, v9, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v11, v11, v7
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v11, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v3, v3, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v9
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v11
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v7, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v8, v7
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -1422,39 +1412,36 @@ define <2 x i16> @clpeak_umad_pat_v2i16(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -1957,74 +1944,67 @@ define <4 x i16> @clpeak_umad_pat_v4i16(<4 x i16> %x, <4 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v4i16:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v9
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v10, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v8, v8, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v10
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v10, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v9, v9, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v10, 16, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v11, 16, v9
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v8, v8, v4
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v10, v10, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v9, 0xffff, v9
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v9, v9, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v11, v11, v7
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v8, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v10, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v9, v2
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v11, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v1, v1, v2
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v10
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v4
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v6
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v3, v3, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v9
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v11
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v8
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v10
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v7, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v6
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, 1, v2
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v2, v2, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v5, vcc, 1, v9
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v6
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v6, 16, v11
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v5
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, v6, v7
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v5, v5, v6
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v6, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v7
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, 1, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v6, 0xffff, v6
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v7, 16, v7
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v7
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v7, 0xffff, v8
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v9
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v8, 16, v8
-; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v7, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v9, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v8, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v6, v6, v9
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v9, 16, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v8, 0xffff, v8
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v7, vcc, v9, v7
+; GFX67-GISEL-NEXT:    v_or_b32_e32 v7, v8, v7
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
@@ -7029,83 +7009,76 @@ define <2 x i16> @clpeak_imad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_imad_pat_v2i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -7337,83 +7310,76 @@ define <2 x i16> @clpeak_umad_pat_v2i16_x2(<2 x i16> %x, <2 x i16> %y) {
 ; GFX67-GISEL-LABEL: clpeak_umad_pat_v2i16_x2:
 ; GFX67-GISEL:       ; %bb.0: ; %entry
 ; GFX67-GISEL-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, 1, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v4, v2
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v5, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v4, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v5, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v1, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v0, v0, v1
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v1, 16, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v0, v2
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v5, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v5
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v4, v4, v5
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
+; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v4, v0, v4
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v5, v1, v5
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, v4, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v5, v3
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v5
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v0, v2, v0
+; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v3, v1
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 1, v4
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v5
+; GFX67-GISEL-NEXT:    v_mov_b32_e32 v4, 0x10000
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
-; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, 1, v1
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, v3, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 1, v0
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v4, 0xffff, v4
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v5, 16, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v4, 16, v4
+; GFX67-GISEL-NEXT:    v_add_i32_e32 v4, vcc, v5, v4
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v3, v3, v4
 ; GFX67-GISEL-NEXT:    v_lshrrev_b32_e32 v4, 16, v2
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v0, 0xffff, v0
@@ -8017,9 +7983,8 @@ define <2 x i16> @other_use_mul_mad_v2i16_var(<2 x i16> %x, <2 x i16> %y, <2 x i
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v1, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v3
 ; GFX67-GISEL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
-; GFX67-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v1
 ; GFX67-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v0
-; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; GFX67-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v1
 ; GFX67-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v0, vcc, v0, v4
 ; GFX67-GISEL-NEXT:    v_add_i32_e32 v1, vcc, v1, v5
diff --git a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 1a55bf608ebf..d8ce36429ca4 100644
--- a/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/llvm/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -1667,7 +1667,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_64(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -1795,7 +1794,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_7_64(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 7, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 64, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -1936,7 +1934,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_64_123(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v2, vcc, 64, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x7b, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2199,7 +2196,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_16(ptr addrspace(1) %out, ptr ad
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 16, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2321,7 +2317,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_1_0(ptr addrspace(1) %out, ptr a
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0xffffc400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2456,7 +2451,6 @@ define amdgpu_kernel void @v_test_v2i16_x_sub_0_neg1_0(ptr addrspace(1) %out, pt
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_subrev_i32_e32 v3, vcc, 0x4400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
@@ -2594,10 +2588,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg32_neg32(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffe00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -2718,10 +2711,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg32(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffffe0, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffe00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -2969,10 +2961,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg16_neg16(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, -16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xfff00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3093,10 +3084,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_0_neg16(ptr addrspace(1) %out, ptr
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -16, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xfff00000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3343,10 +3333,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fpone(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc400, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xc4000000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3508,10 +3497,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfpone(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4400, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4400, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x44000000, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3673,10 +3661,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_fptwo(ptr addrspace(1) %out, p
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0x4000, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0x4000, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 2.0, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3801,10 +3788,9 @@ define amdgpu_kernel void @v_test_v2i16_x_add_neg_negfptwo(ptr addrspace(1) %out
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v3, 16, v2
 ; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffc000, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, 0xffffc000, v3
 ; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v3, 0xffff, v3
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v3, 16, v3
+; SI-GISEL-NEXT:    v_add_i32_e32 v3, vcc, -2.0, v3
 ; SI-GISEL-NEXT:    v_or_b32_e32 v2, v2, v3
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3926,9 +3912,8 @@ define amdgpu_kernel void @v_test_v2i16_x_add_undef_neg32(ptr addrspace(1) %out,
 ; SI-GISEL-NEXT:    buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
 ; SI-GISEL-NEXT:    s_waitcnt vmcnt(0)
 ; SI-GISEL-NEXT:    v_lshrrev_b32_e32 v2, 16, v2
-; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffffffe0, v2
-; SI-GISEL-NEXT:    v_and_b32_e32 v2, 0xffff, v2
 ; SI-GISEL-NEXT:    v_lshlrev_b32_e32 v2, 16, v2
+; SI-GISEL-NEXT:    v_add_i32_e32 v2, vcc, 0xffe00000, v2
 ; SI-GISEL-NEXT:    s_mov_b64 s[2:3], s[6:7]
 ; SI-GISEL-NEXT:    buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
 ; SI-GISEL-NEXT:    s_endpgm
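
Note for reviewers: the check-line churn above appears to be regenerated
mechanically (update_llc_test_checks.py style); the functional change is only
the new combine. As a reading aid, here is a minimal sketch of the rewrite in
generic MIR. It is illustrative only: the register names, the s32 type, and
the particular constants are assumptions for the example, not taken from the
tests in this patch.

    ; Before: the mask only clears bits that the shift discards anyway,
    ; since ~0x3fffffff << 2 == 0 in 32 bits.
    %mask:_(s32) = G_CONSTANT i32 1073741823    ; 0x3fffffff
    %two:_(s32)  = G_CONSTANT i32 2
    %and:_(s32)  = G_AND %x, %mask
    %shl:_(s32)  = G_SHL %and, %two(s32)

    ; After: the G_AND is dropped and %x is shifted directly.
    %shl:_(s32)  = G_SHL %x, %two(s32)

This is also why the SI-GISEL checks above lose a `v_and_b32 ... 0xffff`
ahead of each `v_lshlrev_b32 ... 16`: masking to the low 16 bits is redundant
when the value is about to be shifted left by 16 (~0xffff << 16 == 0), and
constant offsets can then be folded past the shift, e.g. adding 0xffffffe0
before the shift becomes adding 0xffe00000 after it.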