Skip to content
This repository was archived by the owner on Jan 26, 2024. It is now read-only.

Commit a833844

Browse files
committed
[X86] combineMulToPMADDWD - replace ASHR(X,16) -> LSHR(X,16)
If we're using an ashr by 16 to sign-extend the upper 16 bits of each i32 element, then we can replace it with an lshr. The lshr moves the sign bit into bit 15, where PMADDWD's implicit sign-extension of the lower i16 picks it up, and it leaves the upper 16 bits zero, so the upper i16 sext-multiply is guaranteed to be zero. The lshr also has a better chance of folding with shuffles etc.
1 parent 7cf1fef commit a833844

File tree

2 files changed

+52
-37
lines changed

2 files changed

+52
-37
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

+6
Original file line numberDiff line numberDiff line change
@@ -44463,6 +44463,12 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
4446344463
if (Src.getScalarValueSizeInBits() == 16)
4446444464
return DAG.getNode(ISD::ZERO_EXTEND_VECTOR_INREG, SDLoc(N), VT, Src);
4446544465
}
44466+
// Convert VSRAI(X, 16) to VSRLI(X, 16): PMADDWD sign-extends the low i16 itself, and the zeroed high i16 multiplies to zero.
44467+
if (Op.getOpcode() == X86ISD::VSRAI && Op.getConstantOperandVal(1) == 16 &&
44468+
N->isOnlyUserOf(Op.getNode())) {
44469+
return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, Op.getOperand(0),
44470+
Op.getOperand(1));
44471+
}
4446644472
return SDValue();
4446744473
};
4446844474
SDValue ZeroN0 = GetZeroableOp(N0);

llvm/test/CodeGen/X86/pmulh.ll

+46-37
Original file line numberDiff line numberDiff line change
@@ -118,18 +118,18 @@ define <4 x i16> @ashr_mulhw_v4i16(<4 x i32> %a, <4 x i32> %b) {
118118
;
119119
; SSE41-LABEL: ashr_mulhw_v4i16:
120120
; SSE41: # %bb.0:
121-
; SSE41-NEXT: psrad $16, %xmm0
122-
; SSE41-NEXT: psrad $16, %xmm1
123-
; SSE41-NEXT: pmulld %xmm1, %xmm0
121+
; SSE41-NEXT: psrld $16, %xmm1
122+
; SSE41-NEXT: psrld $16, %xmm0
123+
; SSE41-NEXT: pmaddwd %xmm1, %xmm0
124124
; SSE41-NEXT: psrld $16, %xmm0
125125
; SSE41-NEXT: packusdw %xmm0, %xmm0
126126
; SSE41-NEXT: retq
127127
;
128128
; AVX-LABEL: ashr_mulhw_v4i16:
129129
; AVX: # %bb.0:
130-
; AVX-NEXT: vpsrad $16, %xmm0, %xmm0
131-
; AVX-NEXT: vpsrad $16, %xmm1, %xmm1
132-
; AVX-NEXT: vpmulld %xmm1, %xmm0, %xmm0
130+
; AVX-NEXT: vpsrld $16, %xmm1, %xmm1
131+
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
132+
; AVX-NEXT: vpmaddwd %xmm1, %xmm0, %xmm0
133133
; AVX-NEXT: vpsrld $16, %xmm0, %xmm0
134134
; AVX-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
135135
; AVX-NEXT: retq
@@ -462,49 +462,58 @@ define <16 x i16> @ashr_mulhuw_v16i16(<16 x i32> %a, <16 x i32> %b) {
462462
;
463463
; SSE41-LABEL: ashr_mulhuw_v16i16:
464464
; SSE41: # %bb.0:
465-
; SSE41-NEXT: psrad $16, %xmm3
466-
; SSE41-NEXT: psrad $16, %xmm2
467-
; SSE41-NEXT: psrad $16, %xmm1
468-
; SSE41-NEXT: psrad $16, %xmm0
469-
; SSE41-NEXT: psrad $16, %xmm7
470-
; SSE41-NEXT: pmulld %xmm3, %xmm7
471-
; SSE41-NEXT: psrad $16, %xmm6
472-
; SSE41-NEXT: pmulld %xmm2, %xmm6
473-
; SSE41-NEXT: psrad $16, %xmm5
474-
; SSE41-NEXT: pmulld %xmm1, %xmm5
475-
; SSE41-NEXT: psrad $16, %xmm4
476-
; SSE41-NEXT: pmulld %xmm4, %xmm0
477-
; SSE41-NEXT: psrld $16, %xmm7
478-
; SSE41-NEXT: psrld $16, %xmm6
479-
; SSE41-NEXT: packusdw %xmm7, %xmm6
465+
; SSE41-NEXT: psrld $16, %xmm4
466+
; SSE41-NEXT: psrld $16, %xmm0
467+
; SSE41-NEXT: pmaddwd %xmm4, %xmm0
480468
; SSE41-NEXT: psrld $16, %xmm5
469+
; SSE41-NEXT: psrld $16, %xmm1
470+
; SSE41-NEXT: pmaddwd %xmm5, %xmm1
471+
; SSE41-NEXT: psrld $16, %xmm6
472+
; SSE41-NEXT: psrld $16, %xmm2
473+
; SSE41-NEXT: pmaddwd %xmm6, %xmm2
474+
; SSE41-NEXT: psrld $16, %xmm7
475+
; SSE41-NEXT: psrld $16, %xmm3
476+
; SSE41-NEXT: pmaddwd %xmm7, %xmm3
477+
; SSE41-NEXT: psrld $16, %xmm3
478+
; SSE41-NEXT: psrld $16, %xmm2
479+
; SSE41-NEXT: packusdw %xmm3, %xmm2
480+
; SSE41-NEXT: psrld $16, %xmm1
481481
; SSE41-NEXT: psrld $16, %xmm0
482-
; SSE41-NEXT: packusdw %xmm5, %xmm0
483-
; SSE41-NEXT: movdqa %xmm6, %xmm1
482+
; SSE41-NEXT: packusdw %xmm1, %xmm0
483+
; SSE41-NEXT: movdqa %xmm2, %xmm1
484484
; SSE41-NEXT: retq
485485
;
486486
; AVX2-LABEL: ashr_mulhuw_v16i16:
487487
; AVX2: # %bb.0:
488-
; AVX2-NEXT: vpsrad $16, %ymm1, %ymm1
489-
; AVX2-NEXT: vpsrad $16, %ymm0, %ymm0
490-
; AVX2-NEXT: vpsrad $16, %ymm3, %ymm3
491-
; AVX2-NEXT: vpmulld %ymm3, %ymm1, %ymm1
492-
; AVX2-NEXT: vpsrad $16, %ymm2, %ymm2
493-
; AVX2-NEXT: vpmulld %ymm2, %ymm0, %ymm0
488+
; AVX2-NEXT: vpsrld $16, %ymm2, %ymm2
489+
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
490+
; AVX2-NEXT: vpmaddwd %ymm2, %ymm0, %ymm0
491+
; AVX2-NEXT: vpsrld $16, %ymm3, %ymm2
492+
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
493+
; AVX2-NEXT: vpmaddwd %ymm2, %ymm1, %ymm1
494494
; AVX2-NEXT: vpsrld $16, %ymm1, %ymm1
495495
; AVX2-NEXT: vpsrld $16, %ymm0, %ymm0
496496
; AVX2-NEXT: vpackusdw %ymm1, %ymm0, %ymm0
497497
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
498498
; AVX2-NEXT: retq
499499
;
500-
; AVX512-LABEL: ashr_mulhuw_v16i16:
501-
; AVX512: # %bb.0:
502-
; AVX512-NEXT: vpsrad $16, %zmm0, %zmm0
503-
; AVX512-NEXT: vpsrad $16, %zmm1, %zmm1
504-
; AVX512-NEXT: vpmulld %zmm1, %zmm0, %zmm0
505-
; AVX512-NEXT: vpsrld $16, %zmm0, %zmm0
506-
; AVX512-NEXT: vpmovdw %zmm0, %ymm0
507-
; AVX512-NEXT: retq
500+
; AVX512F-LABEL: ashr_mulhuw_v16i16:
501+
; AVX512F: # %bb.0:
502+
; AVX512F-NEXT: vpsrad $16, %zmm0, %zmm0
503+
; AVX512F-NEXT: vpsrad $16, %zmm1, %zmm1
504+
; AVX512F-NEXT: vpmulld %zmm1, %zmm0, %zmm0
505+
; AVX512F-NEXT: vpsrld $16, %zmm0, %zmm0
506+
; AVX512F-NEXT: vpmovdw %zmm0, %ymm0
507+
; AVX512F-NEXT: retq
508+
;
509+
; AVX512BW-LABEL: ashr_mulhuw_v16i16:
510+
; AVX512BW: # %bb.0:
511+
; AVX512BW-NEXT: vpsrld $16, %zmm1, %zmm1
512+
; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
513+
; AVX512BW-NEXT: vpmaddwd %zmm1, %zmm0, %zmm0
514+
; AVX512BW-NEXT: vpsrld $16, %zmm0, %zmm0
515+
; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0
516+
; AVX512BW-NEXT: retq
508517
%a1 = ashr <16 x i32> %a, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
509518
%b1 = ashr <16 x i32> %b, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
510519
%c = mul <16 x i32> %a1, %b1

0 commit comments

Comments
 (0)