Commit 70eed33

rj-jesuststellar authored and committed
[InstCombine] Do not combine shuffle+bitcast if the bitcast is eliminable. (llvm#135769)

If we are attempting to combine shuffle+bitcast but the bitcast is pairable with a subsequent bitcast, we should not fold the shuffle, as doing so can block further simplifications.

The motivation for this is a long-standing regression affecting SIMDe on AArch64, introduced indirectly by the AlwaysInliner (1a2e77c).

Some reproducers:
* https://godbolt.org/z/53qx18s6M
* https://godbolt.org/z/o5e43h5M7

(cherry picked from commit c91c3f9)
1 parent 6ddf2e5 · commit 70eed33
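As a rough sketch of the blocked simplification (distilled from the new regression test below; the function name is illustrative and not part of the commit), a shuffle feeds a chain of bitcasts, and folding the shuffle into the first bitcast would prevent the bitcasts from collapsing into one:

define <16 x i8> @sketch(<8 x i32> %v) {
  %s = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %a = bitcast <4 x i32> %s to <2 x i64>   ; pairable with the next bitcast
  %b = bitcast <2 x i64> %a to i128
  %c = bitcast i128 %b to <16 x i8>
  ret <16 x i8> %c
}

With the shuffle fold deferred, the desired end state (per the CHECK lines in the new test) is the shuffle followed by a single bitcast of %s to <16 x i8>.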

File tree: 2 files changed, +47 -4 lines changed

llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp

Lines changed: 12 additions & 4 deletions
@@ -3029,10 +3029,18 @@ Instruction *InstCombinerImpl::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
     SmallVector<BitCastInst *, 8> BCs;
     DenseMap<Type *, Value *> NewBCs;
     for (User *U : SVI.users())
-      if (BitCastInst *BC = dyn_cast<BitCastInst>(U))
-        if (!BC->use_empty())
-          // Only visit bitcasts that weren't previously handled.
-          BCs.push_back(BC);
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(U)) {
+        // Only visit bitcasts that weren't previously handled.
+        if (BC->use_empty())
+          continue;
+        // Prefer to combine bitcasts of bitcasts before attempting this fold.
+        if (BC->hasOneUse()) {
+          auto *BC2 = dyn_cast<BitCastInst>(BC->user_back());
+          if (BC2 && isEliminableCastPair(BC, BC2))
+            continue;
+        }
+        BCs.push_back(BC);
+      }
     for (BitCastInst *BC : BCs) {
       unsigned BegIdx = Mask.front();
       Type *TgtTy = BC->getDestTy();
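For context, an "eliminable cast pair" is a pair of back-to-back casts that InstCombine can replace with a single cast (or remove entirely); the new check skips such bitcast users of the shuffle so the bitcast+bitcast fold can run first. A minimal sketch of such a pair (illustrative only, not taken from the commit):

define <16 x i8> @pair(<4 x i32> %v) {
  ; the two bitcasts below collapse into a single bitcast <4 x i32> %v to <16 x i8>
  %a = bitcast <4 x i32> %v to <2 x i64>
  %b = bitcast <2 x i64> %a to <16 x i8>
  ret <16 x i8> %b
}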

llvm/test/Transforms/InstCombine/shufflevec-bitcast.ll

Lines changed: 35 additions & 0 deletions
@@ -235,3 +235,38 @@ define <3 x i4> @shuf_bitcast_wrong_size(<2 x i8> %v, i8 %x) {
   %r = shufflevector <4 x i4> %b, <4 x i4> undef, <3 x i32> <i32 0, i32 1, i32 2>
   ret <3 x i4> %r
 }
+
+; Negative test - chain of bitcasts.
+
+define <16 x i8> @shuf_bitcast_chain(<8 x i32> %v) {
+; CHECK-LABEL: @shuf_bitcast_chain(
+; CHECK-NEXT: [[S:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[C:%.*]] = bitcast <4 x i32> [[S]] to <16 x i8>
+; CHECK-NEXT: ret <16 x i8> [[C]]
+;
+  %s = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = bitcast <4 x i32> %s to <2 x i64>
+  %b = bitcast <2 x i64> %a to i128
+  %c = bitcast i128 %b to <16 x i8>
+  ret <16 x i8> %c
+}
+
+; Same as above, but showing why it's not feasible to implement the reverse
+; fold in VectorCombine (see #136998).
+
+define <4 x i32> @shuf_bitcast_chain_2(<8 x i32> %v) {
+; CHECK-LABEL: @shuf_bitcast_chain_2(
+; CHECK-NEXT: [[S0:%.*]] = shufflevector <8 x i32> [[V:%.*]], <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT: [[S1:%.*]] = shufflevector <8 x i32> [[V]], <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: [[R:%.*]] = or <4 x i32> [[S0]], [[S1]]
+; CHECK-NEXT: ret <4 x i32> [[R]]
+;
+  %s0 = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s1 = shufflevector <8 x i32> %v, <8 x i32> poison, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %b0 = bitcast <4 x i32> %s0 to i128
+  %b1 = bitcast <4 x i32> %s1 to i128
+  %c0 = bitcast i128 %b0 to <4 x i32>
+  %c1 = bitcast i128 %b1 to <4 x i32>
+  %r = or <4 x i32> %c0, %c1
+  ret <4 x i32> %r
+}
