diff --git a/llvm/lib/Target/VE/CustomDAG.cpp b/llvm/lib/Target/VE/CustomDAG.cpp
index c2c4d88bf799..7a2b246ee743 100644
--- a/llvm/lib/Target/VE/CustomDAG.cpp
+++ b/llvm/lib/Target/VE/CustomDAG.cpp
@@ -1050,7 +1050,7 @@ SDValue CustomDAG::createUniformConstMask(Packing Packing, unsigned NumElements,
   if (IsTrue)
     return Res;
 
-  return DAG.getNOT(DL, Res, Res.getValueType());
+  return createNot(Res, Res.getValueType());
 }
 
 SDValue CustomDAG::getConstant(uint64_t Val, EVT VT, bool IsTarget,
@@ -1360,7 +1360,7 @@ SDValue CustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT,
   case VEISD::VVP_REDUCE_UMAX:
   case VEISD::VVP_REDUCE_SMIN:
   case VEISD::VVP_REDUCE_OR: {
-    // Mask legalization using vm_popcount
+    // Mask out the off lanes.
    if (!isAllTrueMask(Mask))
      VectorV = getNode(ISD::AND, Mask.getValueType(), {VectorV, Mask});
 
@@ -1377,12 +1377,13 @@ SDValue CustomDAG::getLegalReductionOpVVP(unsigned VVPOpcode, EVT ResVT,
   case VEISD::VVP_REDUCE_SMAX:
   case VEISD::VVP_REDUCE_MUL:
   case VEISD::VVP_REDUCE_AND: {
-    // TODO: Invert and OR the mask, then compare PCVM against AVL.
+    // Invert the mask and OR it into the vector.
+    if (!isAllTrueMask(Mask)) {
+      auto InverseMask = createNot(Mask, Mask.getValueType());
+      VectorV = getNode(ISD::OR, Mask.getValueType(), {InverseMask, VectorV});
+    }
     // Mask legalization using vm_popcount
-    if (!isAllTrueMask(Mask))
-      VectorV = getNode(ISD::AND, Mask.getValueType(), {VectorV, Mask});
-
     auto Pop = createMaskPopcount(VectorV, AVL);
     auto LegalPop = DAG.getZExtOrTrunc(Pop, DL, MVT::i32);
diff --git a/llvm/test/CodeGen/VE/Packed/vp-reduce-v512i1-mask-avl-isel.ll b/llvm/test/CodeGen/VE/Packed/vp-reduce-v512i1-mask-avl-isel.ll
index 54a1234b2c39..5f12111b6a6c 100644
--- a/llvm/test/CodeGen/VE/Packed/vp-reduce-v512i1-mask-avl-isel.ll
+++ b/llvm/test/CodeGen/VE/Packed/vp-reduce-v512i1-mask-avl-isel.ll
@@ -2,15 +2,46 @@
 ; RUN: llc -O0 --march=ve -mattr=+packed,+vpu %s -o=/dev/stdout | FileCheck %s
 
 define fastcc i1 @test_reduce_and(i1 %s, <512 x i1> %v, <512 x i1> %m, i32 %n) {
-; FIXME: The generated code is incorrect!
 ; CHECK-LABEL: test_reduce_and:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.l %s11, -32, %s11
+; CHECK-NEXT:    brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    ld %s61, 24(, %s14)
+; CHECK-NEXT:    or %s62, 0, %s0
+; CHECK-NEXT:    lea %s63, 315
+; CHECK-NEXT:    shm.l %s63, (%s61)
+; CHECK-NEXT:    shm.l %s8, 8(%s61)
+; CHECK-NEXT:    shm.l %s11, 16(%s61)
+; CHECK-NEXT:    monc
+; CHECK-NEXT:    or %s0, 0, %s62
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    svm %s16, %vm4, 0
+; CHECK-NEXT:    st %s16, (, %s11)
+; CHECK-NEXT:    svm %s16, %vm4, 1
+; CHECK-NEXT:    st %s16, 8(, %s11)
+; CHECK-NEXT:    svm %s16, %vm4, 2
+; CHECK-NEXT:    st %s16, 16(, %s11)
+; CHECK-NEXT:    svm %s16, %vm4, 3
+; CHECK-NEXT:    st %s16, 24(, %s11) # 32-byte Folded Spill
+; CHECK-NEXT:    andm %vm4, %vm0, %vm2
+; CHECK-NEXT:    ld %s16, (, %s11)
+; CHECK-NEXT:    lvm %vm2, 0, %s16
+; CHECK-NEXT:    ld %s16, 8(, %s11)
+; CHECK-NEXT:    lvm %vm2, 1, %s16
+; CHECK-NEXT:    ld %s16, 16(, %s11)
+; CHECK-NEXT:    lvm %vm2, 2, %s16
+; CHECK-NEXT:    ld %s16, 24(, %s11) # 32-byte Folded Reload
+; CHECK-NEXT:    lvm %vm2, 3, %s16
 ; CHECK-NEXT:    and %s1, %s1, (32)0
 ; CHECK-NEXT:    or %s2, 0, %s1
-; CHECK-NEXT:    andm %vm1, %vm2, %vm4
+; CHECK-NEXT:    andm %vm1, %vm0, %vm0
+; CHECK-NEXT:    xorm %vm2, %vm2, %vm1
+; CHECK-NEXT:    orm %vm2, %vm2, %vm4
 ; CHECK-NEXT:    lvl %s2
-; CHECK-NEXT:    pcvm %s3, %vm1
-; CHECK-NEXT:    andm %vm1, %vm3, %vm5
+; CHECK-NEXT:    pcvm %s3, %vm2
+; CHECK-NEXT:    xorm %vm1, %vm5, %vm1
+; CHECK-NEXT:    orm %vm1, %vm1, %vm3
 ; CHECK-NEXT:    pcvm %s1, %vm1
 ; CHECK-NEXT:    adds.l %s1, %s1, %s3
 ; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
@@ -23,6 +54,7 @@ define fastcc i1 @test_reduce_and(i1 %s, <512 x i1> %v, <512 x i1> %m, i32 %n) {
 ; CHECK-NEXT:    # implicit-def: $sx1
 ; CHECK-NEXT:    or %s1, 0, %s2
 ; CHECK-NEXT:    and %s0, %s0, %s1
+; CHECK-NEXT:    adds.l %s11, 32, %s11
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = call i1 @llvm.vp.reduce.and.v512i1(i1 %s, <512 x i1> %v, <512 x i1> %m, i32 %n)
   ret i1 %r
diff --git a/llvm/test/CodeGen/VE/Vector/vp-reduce-v256i1-mask-avl-isel.ll b/llvm/test/CodeGen/VE/Vector/vp-reduce-v256i1-mask-avl-isel.ll
index 8f841a2b4acd..2ebf348b2754 100644
--- a/llvm/test/CodeGen/VE/Vector/vp-reduce-v256i1-mask-avl-isel.ll
+++ b/llvm/test/CodeGen/VE/Vector/vp-reduce-v256i1-mask-avl-isel.ll
@@ -2,12 +2,42 @@
 ; RUN: llc -O0 --march=ve -mattr=-packed,+vpu %s -o=/dev/stdout | FileCheck %s
 
 define fastcc i1 @test_reduce_and(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %n) {
-; FIXME: The generated code is incorrect!
 ; CHECK-LABEL: test_reduce_and:
 ; CHECK:       # %bb.0:
+; CHECK-NEXT:    adds.l %s11, -32, %s11
+; CHECK-NEXT:    brge.l.t %s11, %s8, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    ld %s61, 24(, %s14)
+; CHECK-NEXT:    or %s62, 0, %s0
+; CHECK-NEXT:    lea %s63, 315
+; CHECK-NEXT:    shm.l %s63, (%s61)
+; CHECK-NEXT:    shm.l %s8, 8(%s61)
+; CHECK-NEXT:    shm.l %s11, 16(%s61)
+; CHECK-NEXT:    monc
+; CHECK-NEXT:    or %s0, 0, %s62
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    svm %s16, %vm2, 0
+; CHECK-NEXT:    st %s16, (, %s11)
+; CHECK-NEXT:    svm %s16, %vm2, 1
+; CHECK-NEXT:    st %s16, 8(, %s11)
+; CHECK-NEXT:    svm %s16, %vm2, 2
+; CHECK-NEXT:    st %s16, 16(, %s11)
+; CHECK-NEXT:    svm %s16, %vm2, 3
+; CHECK-NEXT:    st %s16, 24(, %s11) # 32-byte Folded Spill
+; CHECK-NEXT:    andm %vm2, %vm0, %vm1
+; CHECK-NEXT:    ld %s16, (, %s11)
+; CHECK-NEXT:    lvm %vm1, 0, %s16
+; CHECK-NEXT:    ld %s16, 8(, %s11)
+; CHECK-NEXT:    lvm %vm1, 1, %s16
+; CHECK-NEXT:    ld %s16, 16(, %s11)
+; CHECK-NEXT:    lvm %vm1, 2, %s16
+; CHECK-NEXT:    ld %s16, 24(, %s11) # 32-byte Folded Reload
+; CHECK-NEXT:    lvm %vm1, 3, %s16
 ; CHECK-NEXT:    and %s1, %s1, (32)0
 ; CHECK-NEXT:    or %s2, 0, %s1
-; CHECK-NEXT:    andm %vm1, %vm1, %vm2
+; CHECK-NEXT:    andm %vm3, %vm0, %vm0
+; CHECK-NEXT:    xorm %vm1, %vm1, %vm3
+; CHECK-NEXT:    orm %vm1, %vm1, %vm2
 ; CHECK-NEXT:    lvl %s2
 ; CHECK-NEXT:    pcvm %s1, %vm1
 ; CHECK-NEXT:    # kill: def $sw1 killed $sw1 killed $sx1
@@ -20,6 +50,7 @@ define fastcc i1 @test_reduce_and(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %n) {
 ; CHECK-NEXT:    # implicit-def: $sx1
 ; CHECK-NEXT:    or %s1, 0, %s2
 ; CHECK-NEXT:    and %s0, %s0, %s1
+; CHECK-NEXT:    adds.l %s11, 32, %s11
 ; CHECK-NEXT:    b.l.t (, %s10)
   %r = call i1 @llvm.vp.reduce.and.v256i1(i1 %s, <256 x i1> %v, <256 x i1> %m, i32 %n)
   ret i1 %r
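
Note (editorial sketch, not part of the patch): the CustomDAG.cpp change distinguishes two flavors of masked i1 reductions. OR-like reductions (VVP_REDUCE_OR/UMAX/SMIN) mask the off lanes to false with AND, while AND-like reductions (VVP_REDUCE_AND/SMAX/MUL) force the off lanes to true by OR-ing in the inverted mask, as the resolved TODO describes. The scalar model below only illustrates that intent; the helper names and the final comparisons (popcount against AVL, popcount against zero) are assumptions taken from the TODO comment, not code copied from CustomDAG.cpp.

```cpp
#include <vector>

// Hypothetical scalar model of the mask legalization for i1 reductions.
// AND-like reductions (AND/SMAX/MUL on i1) are true only if every active
// lane is true, so off lanes are forced to true via OR with the inverted
// mask, and the result holds iff popcount over the first AVL lanes == AVL.
static bool reduceAndLike(const std::vector<bool> &V,
                          const std::vector<bool> &M, unsigned AVL) {
  unsigned Pop = 0;
  for (unsigned I = 0; I < AVL; ++I)
    Pop += (V[I] || !M[I]) ? 1 : 0; // VectorV | ~Mask
  return Pop == AVL;                // assumed: compare PCVM against AVL
}

// OR-like reductions (OR/UMAX/SMIN on i1) are true as soon as any active
// lane is true, so off lanes are masked out with AND, and the result holds
// iff the popcount is non-zero.
static bool reduceOrLike(const std::vector<bool> &V,
                         const std::vector<bool> &M, unsigned AVL) {
  unsigned Pop = 0;
  for (unsigned I = 0; I < AVL; ++I)
    Pop += (V[I] && M[I]) ? 1 : 0;  // VectorV & Mask
  return Pop != 0;                  // assumed: non-zero popcount means true
}
```

Under this model an all-false mask makes the AND-like reduction return true (the neutral element of AND), which is then combined with the start value, matching the final `and %s0, %s0, %s1` in the updated CHECK lines; the previous lowering ANDed the vector with the mask even for AND-like reductions, which is what the removed FIXME lines referred to.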