Skip to content

Commit 75b292c

Browse files
committed
[AMDGPU][DAG] Fix insert_vector_elt lowering for 8-bit elements
The bitmask used to extract the bits assumed 16-bit elements and did not take the actual element size into account. Reviewed By: arsenm. Differential Revision: https://reviews.llvm.org/D135156
1 parent b794d72 commit 75b292c

File tree

3 files changed

+13
-12
lines changed

3 files changed

+13
-12
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -5733,11 +5733,11 @@ SDValue SITargetLowering::lowerINSERT_VECTOR_ELT(SDValue Op,
57335733

57345734
// Convert vector index to bit-index and get the required bit mask.
57355735
assert(isPowerOf2_32(EltSize));
5736+
const auto EltMask = maskTrailingOnes<uint64_t>(EltSize);
57365737
SDValue ScaleFactor = DAG.getConstant(Log2_32(EltSize), SL, MVT::i32);
57375738
SDValue ScaledIdx = DAG.getNode(ISD::SHL, SL, MVT::i32, Idx, ScaleFactor);
57385739
SDValue BFM = DAG.getNode(ISD::SHL, SL, IntVT,
5739-
DAG.getConstant(0xffff, SL, IntVT),
5740-
ScaledIdx);
5740+
DAG.getConstant(EltMask, SL, IntVT), ScaledIdx);
57415741

57425742
// 1. Create a congruent vector with the target value in each element.
57435743
SDValue ExtVal = DAG.getNode(ISD::BITCAST, SL, IntVT,

llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -447,7 +447,7 @@ define amdgpu_kernel void @byte8_inselt(<8 x i8> addrspace(1)* %out, <8 x i8> %v
447447
; GCN: ; %bb.0: ; %entry
448448
; GCN-NEXT: s_load_dword s6, s[0:1], 0x34
449449
; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24
450-
; GCN-NEXT: s_mov_b64 s[4:5], 0xffff
450+
; GCN-NEXT: s_mov_b64 s[4:5], 0xff
451451
; GCN-NEXT: s_waitcnt lgkmcnt(0)
452452
; GCN-NEXT: s_lshl_b32 s6, s6, 3
453453
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s6

llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1033,7 +1033,7 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
10331033
; SI-NEXT: s_mov_b32 s2, -1
10341034
; SI-NEXT: s_waitcnt lgkmcnt(0)
10351035
; SI-NEXT: s_lshl_b32 s5, s6, 3
1036-
; SI-NEXT: s_lshl_b32 s5, -1, s5
1036+
; SI-NEXT: s_lshl_b32 s5, 0xff, s5
10371037
; SI-NEXT: s_andn2_b32 s4, s4, s5
10381038
; SI-NEXT: s_and_b32 s5, s5, 0x505
10391039
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -1046,14 +1046,15 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %ou
10461046
; VI-NEXT: s_load_dword s6, s[4:5], 0x4c
10471047
; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0
10481048
; VI-NEXT: s_load_dword s4, s[4:5], 0x28
1049+
; VI-NEXT: v_mov_b32_e32 v0, 0xff
10491050
; VI-NEXT: s_mov_b32 s3, 0x1100f000
1050-
; VI-NEXT: s_mov_b32 s2, -1
10511051
; VI-NEXT: s_waitcnt lgkmcnt(0)
10521052
; VI-NEXT: s_lshl_b32 s5, s6, 3
1053-
; VI-NEXT: v_lshlrev_b16_e64 v0, s5, -1
1053+
; VI-NEXT: v_lshlrev_b16_e32 v0, s5, v0
10541054
; VI-NEXT: v_not_b32_e32 v1, v0
10551055
; VI-NEXT: v_and_b32_e32 v1, s4, v1
10561056
; VI-NEXT: v_and_b32_e32 v0, 0x505, v0
1057+
; VI-NEXT: s_mov_b32 s2, -1
10571058
; VI-NEXT: v_or_b32_e32 v0, v0, v1
10581059
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
10591060
; VI-NEXT: s_endpgm
@@ -1074,7 +1075,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
10741075
; SI-NEXT: s_mov_b32 s2, -1
10751076
; SI-NEXT: s_waitcnt lgkmcnt(0)
10761077
; SI-NEXT: s_lshl_b32 s5, s6, 3
1077-
; SI-NEXT: s_lshl_b32 s5, 0xffff, s5
1078+
; SI-NEXT: s_lshl_b32 s5, 0xff, s5
10781079
; SI-NEXT: s_andn2_b32 s4, s4, s5
10791080
; SI-NEXT: s_and_b32 s5, s5, 0x5050505
10801081
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -1094,7 +1095,7 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %ou
10941095
; VI-NEXT: s_mov_b32 s2, -1
10951096
; VI-NEXT: s_waitcnt lgkmcnt(0)
10961097
; VI-NEXT: s_lshl_b32 s5, s6, 3
1097-
; VI-NEXT: s_lshl_b32 s5, 0xffff, s5
1098+
; VI-NEXT: s_lshl_b32 s5, 0xff, s5
10981099
; VI-NEXT: s_andn2_b32 s4, s4, s5
10991100
; VI-NEXT: s_and_b32 s5, s5, 0x5050505
11001101
; VI-NEXT: s_or_b32 s4, s5, s4
@@ -1119,7 +1120,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
11191120
; SI-NEXT: s_mov_b32 s2, -1
11201121
; SI-NEXT: s_waitcnt lgkmcnt(0)
11211122
; SI-NEXT: s_lshl_b32 s5, s6, 3
1122-
; SI-NEXT: s_lshl_b32 s5, 0xffff, s5
1123+
; SI-NEXT: s_lshl_b32 s5, 0xff, s5
11231124
; SI-NEXT: s_andn2_b32 s4, s4, s5
11241125
; SI-NEXT: s_and_b32 s5, s5, 0x5050505
11251126
; SI-NEXT: s_or_b32 s4, s5, s4
@@ -1136,7 +1137,7 @@ define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %ou
11361137
; VI-NEXT: s_mov_b32 s2, -1
11371138
; VI-NEXT: s_waitcnt lgkmcnt(0)
11381139
; VI-NEXT: s_lshl_b32 s5, s6, 3
1139-
; VI-NEXT: s_lshl_b32 s5, 0xffff, s5
1140+
; VI-NEXT: s_lshl_b32 s5, 0xff, s5
11401141
; VI-NEXT: s_andn2_b32 s4, s4, s5
11411142
; VI-NEXT: s_and_b32 s5, s5, 0x5050505
11421143
; VI-NEXT: s_or_b32 s4, s5, s4
@@ -1160,7 +1161,7 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
11601161
; SI-NEXT: s_mov_b32 s5, s1
11611162
; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
11621163
; SI-NEXT: s_lshl_b32 s8, s8, 3
1163-
; SI-NEXT: s_mov_b64 s[2:3], 0xffff
1164+
; SI-NEXT: s_mov_b64 s[2:3], 0xff
11641165
; SI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
11651166
; SI-NEXT: s_and_b32 s9, s3, 0x5050505
11661167
; SI-NEXT: s_and_b32 s8, s2, 0x5050505
@@ -1183,7 +1184,7 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %
11831184
; VI-NEXT: s_mov_b32 s5, s1
11841185
; VI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0
11851186
; VI-NEXT: s_lshl_b32 s8, s8, 3
1186-
; VI-NEXT: s_mov_b64 s[2:3], 0xffff
1187+
; VI-NEXT: s_mov_b64 s[2:3], 0xff
11871188
; VI-NEXT: s_lshl_b64 s[2:3], s[2:3], s8
11881189
; VI-NEXT: s_and_b32 s9, s3, 0x5050505
11891190
; VI-NEXT: s_and_b32 s8, s2, 0x5050505

0 commit comments

Comments
 (0)