Skip to content

WIP: [VE] fix AND reductions on mask types #83

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 1 commit into
base: develop
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 7 additions & 6 deletions llvm/lib/Target/VE/CustomDAG.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1050,7 +1050,7 @@ SDValue CustomDAG::createUniformConstMask(Packing Packing, unsigned NumElements,
if (IsTrue)
return Res;

return DAG.getNOT(DL, Res, Res.getValueType());
return createNot(Res, Res.getValueType());
}

SDValue CustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget,
Expand Down Expand Up @@ -1360,7 +1360,7 @@ SDValue CustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT,
case VEISD::VVP_REDUCE_UMAX:
case VEISD::VVP_REDUCE_SMIN:
case VEISD::VVP_REDUCE_OR: {
// Mask legalization using vm_popcount
// Mask out the inactive lanes by ANDing the vector with the mask.
if (!isAllTrueMask(Mask))
VectorV = getNode(ISD::AND, Mask.getValueType(), {VectorV, Mask});

Expand All @@ -1377,12 +1377,13 @@ SDValue CustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT,
case VEISD::VVP_REDUCE_SMAX:
case VEISD::VVP_REDUCE_MUL:
case VEISD::VVP_REDUCE_AND: {
// TODO: Invert and OR the mask, then compare PCVM against AVL.
// OR the inverted mask into the vector so that masked-off lanes read as
// true, the neutral element of an AND reduction.
if (!isAllTrueMask(Mask)) {
auto InverseMask = createNot(Mask, Mask.getValueType());
VectorV = getNode(ISD::OR, Mask.getValueType(), {InverseMask, VectorV});
}

// Mask legalization using vm_popcount
if (!isAllTrueMask(Mask))
VectorV = getNode(ISD::AND, Mask.getValueType(), {VectorV, Mask});

auto Pop = createMaskPopcount(VectorV, AVL);
auto LegalPop = DAG.getZExtOrTrunc(Pop, DL, MVT::i32);

Expand Down
40 changes: 36 additions & 4 deletions llvm/test/CodeGen/VE/Packed/vp-reduce-v512i1-mask-avl-isel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,46 @@
; RUN: llc -O0 --march=ve -mattr=+packed,+vpu %s -o=/dev/stdout | FileCheck %s

define fastcc i1 @test_reduce_and(i1 %s, <512 x i1> %v, <512 x i1> %m, i32 %n) {
; FIXME: The generated code is incorrect!
; CHECK-LABEL: test_reduce_and:
; CHECK: # %bb.0:
; CHECK-NEXT: adds.l %s11, -32, %s11
; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: ld %s61, 24(, %s14)
; CHECK-NEXT: or %s62, 0, %s0
; CHECK-NEXT: lea %s63, 315
; CHECK-NEXT: shm.l %s63, (%s61)
; CHECK-NEXT: shm.l %s8, 8(%s61)
; CHECK-NEXT: shm.l %s11, 16(%s61)
; CHECK-NEXT: monc
; CHECK-NEXT: or %s0, 0, %s62
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: svm %s16, %vm4, 0
; CHECK-NEXT: st %s16, (, %s11)
; CHECK-NEXT: svm %s16, %vm4, 1
; CHECK-NEXT: st %s16, 8(, %s11)
; CHECK-NEXT: svm %s16, %vm4, 2
; CHECK-NEXT: st %s16, 16(, %s11)
; CHECK-NEXT: svm %s16, %vm4, 3
; CHECK-NEXT: st %s16, 24(, %s11) # 32-byte Folded Spill
; CHECK-NEXT: andm %vm4, %vm0, %vm2
; CHECK-NEXT: ld %s16, (, %s11)
; CHECK-NEXT: lvm %vm2, 0, %s16
; CHECK-NEXT: ld %s16, 8(, %s11)
; CHECK-NEXT: lvm %vm2, 1, %s16
; CHECK-NEXT: ld %s16, 16(, %s11)
; CHECK-NEXT: lvm %vm2, 2, %s16
; CHECK-NEXT: ld %s16, 24(, %s11) # 32-byte Folded Reload
; CHECK-NEXT: lvm %vm2, 3, %s16
; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: or %s2, 0, %s1
; CHECK-NEXT: andm %vm1, %vm2, %vm4
; CHECK-NEXT: andm %vm1, %vm0, %vm0
; CHECK-NEXT: xorm %vm2, %vm2, %vm1
; CHECK-NEXT: orm %vm2, %vm2, %vm4
; CHECK-NEXT: lvl %s2
; CHECK-NEXT: pcvm %s3, %vm1
; CHECK-NEXT: andm %vm1, %vm3, %vm5
; CHECK-NEXT: pcvm %s3, %vm2
; CHECK-NEXT: xorm %vm1, %vm5, %vm1
; CHECK-NEXT: orm %vm1, %vm1, %vm3
; CHECK-NEXT: pcvm %s1, %vm1
; CHECK-NEXT: adds.l %s1, %s1, %s3
; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
Expand All @@ -23,6 +54,7 @@ define fastcc i1 @test_reduce_and(i1 %s, <512 x i1> %v, <512 x i1> %m, i32 %n) {
; CHECK-NEXT: # implicit-def: $sx1
; CHECK-NEXT: or %s1, 0, %s2
; CHECK-NEXT: and %s0, %s0, %s1
; CHECK-NEXT: adds.l %s11, 32, %s11
; CHECK-NEXT: b.l.t (, %s10)
%r = call i1 @llvm.vp.reduce.and.v512i1(i1 %s, <512 x i1> %v, <512 x i1> %m, i32 %n)
ret i1 %r
Expand Down
35 changes: 33 additions & 2 deletions llvm/test/CodeGen/VE/Vector/vp-reduce-v256i1-mask-avl-isel.ll
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,42 @@
; RUN: llc -O0 --march=ve -mattr=-packed,+vpu %s -o=/dev/stdout | FileCheck %s

define fastcc i1 @test_reduce_and(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %n) {
; FIXME: The generated code is incorrect!
; CHECK-LABEL: test_reduce_and:
; CHECK: # %bb.0:
; CHECK-NEXT: adds.l %s11, -32, %s11
; CHECK-NEXT: brge.l.t %s11, %s8, .LBB0_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: ld %s61, 24(, %s14)
; CHECK-NEXT: or %s62, 0, %s0
; CHECK-NEXT: lea %s63, 315
; CHECK-NEXT: shm.l %s63, (%s61)
; CHECK-NEXT: shm.l %s8, 8(%s61)
; CHECK-NEXT: shm.l %s11, 16(%s61)
; CHECK-NEXT: monc
; CHECK-NEXT: or %s0, 0, %s62
; CHECK-NEXT: .LBB0_2:
; CHECK-NEXT: svm %s16, %vm2, 0
; CHECK-NEXT: st %s16, (, %s11)
; CHECK-NEXT: svm %s16, %vm2, 1
; CHECK-NEXT: st %s16, 8(, %s11)
; CHECK-NEXT: svm %s16, %vm2, 2
; CHECK-NEXT: st %s16, 16(, %s11)
; CHECK-NEXT: svm %s16, %vm2, 3
; CHECK-NEXT: st %s16, 24(, %s11) # 32-byte Folded Spill
; CHECK-NEXT: andm %vm2, %vm0, %vm1
; CHECK-NEXT: ld %s16, (, %s11)
; CHECK-NEXT: lvm %vm1, 0, %s16
; CHECK-NEXT: ld %s16, 8(, %s11)
; CHECK-NEXT: lvm %vm1, 1, %s16
; CHECK-NEXT: ld %s16, 16(, %s11)
; CHECK-NEXT: lvm %vm1, 2, %s16
; CHECK-NEXT: ld %s16, 24(, %s11) # 32-byte Folded Reload
; CHECK-NEXT: lvm %vm1, 3, %s16
; CHECK-NEXT: and %s1, %s1, (32)0
; CHECK-NEXT: or %s2, 0, %s1
; CHECK-NEXT: andm %vm1, %vm1, %vm2
; CHECK-NEXT: andm %vm3, %vm0, %vm0
; CHECK-NEXT: xorm %vm1, %vm1, %vm3
; CHECK-NEXT: orm %vm1, %vm1, %vm2
; CHECK-NEXT: lvl %s2
; CHECK-NEXT: pcvm %s1, %vm1
; CHECK-NEXT: # kill: def $sw1 killed $sw1 killed $sx1
Expand All @@ -20,6 +50,7 @@ define fastcc i1 @test_reduce_and(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %n) {
; CHECK-NEXT: # implicit-def: $sx1
; CHECK-NEXT: or %s1, 0, %s2
; CHECK-NEXT: and %s0, %s0, %s1
; CHECK-NEXT: adds.l %s11, 32, %s11
; CHECK-NEXT: b.l.t (, %s10)
%r = call i1 @llvm.vp.reduce.and.v256i1(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %n)
ret i1 %r
Expand Down