diff --git a/runtime/compiler/p/codegen/J9CodeGenerator.cpp b/runtime/compiler/p/codegen/J9CodeGenerator.cpp
index 1a784de9d9b..656c07c227b 100644
--- a/runtime/compiler/p/codegen/J9CodeGenerator.cpp
+++ b/runtime/compiler/p/codegen/J9CodeGenerator.cpp
@@ -133,6 +133,20 @@ J9::Power::CodeGenerator::initialize()
       cg->setEnableTLHPrefetching();
       }
 
+   static bool disableInlineStringCodingHasNegatives =
+      feGetEnv("TR_DisableInlineStringCodingHasNegatives") != NULL;
+   static bool disableInlineStringCodingCountPositives =
+      feGetEnv("TR_DisableInlineStringCodingCountPositives") != NULL;
+   if (comp->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8) &&
+      comp->target().cpu.supportsFeature(OMR_FEATURE_PPC_HAS_VSX) &&
+      !TR::Compiler->om.canGenerateArraylets())
+      {
+      if (!disableInlineStringCodingHasNegatives)
+         cg->setSupportsInlineStringCodingHasNegatives();
+      if (!disableInlineStringCodingCountPositives)
+         cg->setSupportsInlineStringCodingCountPositives();
+      }
+
    //This env-var does 3 things:
    // 1. Prevents batch clear in frontend/j9/rossa.cpp
    // 2. Prevents all allocations to nonZeroTLH
diff --git a/runtime/compiler/p/codegen/J9TreeEvaluator.cpp b/runtime/compiler/p/codegen/J9TreeEvaluator.cpp
index 04f5a9b5531..4958da75f13 100644
--- a/runtime/compiler/p/codegen/J9TreeEvaluator.cpp
+++ b/runtime/compiler/p/codegen/J9TreeEvaluator.cpp
@@ -11693,6 +11693,296 @@ static bool inlineIntrinsicInflate(TR::Node *node, TR::CodeGenerator *cg)
    return true;
    }
 
+/*
+ * Emits the inline instruction sequence shared by the java/lang/StringCoding hasNegatives and
+ * countPositives recognized methods (see the callers in inlineDirectCall).
+ *
+ * Children of the call node: child 0 is the array, child 1 the starting offset and child 2 the
+ * length. The first byte is tested on its own, then the data is scanned 16 bytes at a time with
+ * lxvw4x/vcmpgtsb., then 4 bytes at a time with lwzx, and finally one byte at a time.
+ *
+ * isCountPositives selects which result is produced in the returned register:
+ *   - true:  the number of bytes before the first negative byte, or the length itself when no
+ *            negative byte is found or the length is <= 0
+ *   - false: 1 if a negative byte is found, 0 otherwise
+ *
+ * The returned register is also set on the node and all children have their reference counts
+ * decremented here.
+ */
+static TR::Register *inlineStringCodingHasNegativesOrCountPositives(TR::Node *node,
+                                                                    TR::CodeGenerator *cg,
+                                                                    bool isCountPositives)
+   {
+   TR::Compilation *comp = cg->comp();
+   bool isLE = comp->target().cpu.isLittleEndian();
+
+   TR::Register *startReg = cg->gprClobberEvaluate(node->getChild(0)); // array
+   TR::Register *indexReg = cg->gprClobberEvaluate(node->getChild(1)); // offset
+   TR::Register *lengthReg = cg->evaluate(node->getChild(2)); // length
+
+   TR::Register *tempReg = cg->allocateRegister();
+
+   TR::Register *cr6 = cg->allocateRegister(TR_CCR);
+   TR::Register *cr0 = NULL;
+   if (isCountPositives && isLE)
+      cr0 = cg->allocateRegister(TR_CCR);
+
+   TR::Register *vconstant0Reg = cg->allocateRegister(TR_VRF);
+   TR::Register *vtmp1Reg = cg->allocateRegister(TR_VRF);
+   TR::Register *vtmp2Reg = cg->allocateRegister(TR_VRF);
+
+   TR::Register *storeReg = cg->allocateRegister();
+   TR::Register *maskReg = cg->allocateRegister();
+
+   TR::LabelSymbol *VSXLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *serialPrepLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *serialUnrollLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *serialLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *vecResultLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *resultLabel = generateLabelSymbol(cg);
+   TR::LabelSymbol *endLabel = generateLabelSymbol(cg);
+
+   // check empty
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, lengthReg, 0);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::ble, node, resultLabel, cr6);
+
+   // skip over or load the header
+#if defined(J9VM_GC_SPARSE_HEAP_ALLOCATION)
+   if (TR::Compiler->om.isOffHeapAllocationEnabled())
+      {
+      generateTrg1MemInstruction(
+         cg, TR::InstOpCode::ld, node, startReg,
+         TR::MemoryReference::createWithDisplacement(
+            cg, startReg, TR::Compiler->om.offsetOfContiguousDataAddrField(), 8)
+      );
+      }
+   else
+#endif /* J9VM_GC_SPARSE_HEAP_ALLOCATION */
+      {
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, startReg, startReg,
+                                     TR::Compiler->om.contiguousArrayHeaderSizeInBytes());
+      }
+
+   // get the starting address
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, startReg, startReg, indexReg);
+   // make the index 0 since everything we need is relative to the offset
+   generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, indexReg, 0);
+
+   // check the first byte
+   generateTrg1MemInstruction(cg, TR::InstOpCode::lbzx, node, tempReg,
+                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 1));
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::extsb, node, tempReg, tempReg);
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, tempReg, 0);
+   // when seeking negatives, we need to return 1
+   if (!isCountPositives)
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 1);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);
+   // if we only have one byte end it here, and return 0 for hasNegative
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, lengthReg);
+   if (!isCountPositives)
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 0);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, endLabel, cr6);
+
+   // ready the zero reg
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::vxor, node, vconstant0Reg, vconstant0Reg, vconstant0Reg);
+   // tempReg marks the end where we could use lxv
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, tempReg, lengthReg, -15);
+
+   // --- start of VSXLoop
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, VSXLabel);
+   // go to residue if we don't have enough items to do one load
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, tempReg);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, serialPrepLabel, cr6);
+
+   // load 16 items
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvw4x, node, vtmp1Reg, startReg, indexReg);
+   if (isLE)
+      {
+      // swap around the shorts in each word; we need 2 instructions to load 16
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::vspltisw, node, vtmp2Reg, 8);
+      generateTrg1Src2Instruction(cg, TR::InstOpCode::vadduwm, node, vtmp2Reg, vtmp2Reg, vtmp2Reg);
+      generateTrg1Src2Instruction(cg, TR::InstOpCode::vrlw, node, vtmp1Reg, vtmp1Reg, vtmp2Reg);
+      // then swap around the bytes in each short
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::vspltish, node, vtmp2Reg, 8);
+      generateTrg1Src2Instruction(cg, TR::InstOpCode::vrlh, node, vtmp1Reg, vtmp1Reg, vtmp2Reg);
+      }
+   // bit 2 of cr6 (ZERO) will not be set if any comparison is true
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmpgtsb_r, node, vtmp1Reg, vconstant0Reg, vtmp1Reg);
+   // branch when the ZERO bit is not set
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, vecResultLabel, cr6);
+
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 16);
+   generateLabelInstruction(cg, TR::InstOpCode::b, node, VSXLabel);
+
+   // --- this label is only used when we exit from the VSXLoop
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, vecResultLabel);
+   if (isCountPositives) // jump to the serial label which should soon count to the value we want
+      {
+      //generateTrg1Src1Instruction(cg, TR::InstOpCode::vclzlsbb, node, returnReg, vtmp1Reg);
+      //generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, returnReg, returnReg, indexReg);
+      generateLabelInstruction(cg, TR::InstOpCode::b, node, serialPrepLabel);
+      }
+   else // just report 1
+      {
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 1);
+      generateLabelInstruction(cg, TR::InstOpCode::b, node, endLabel);
+      }
+
+   // --- serialPrepLabel to deal with whatever remains
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialPrepLabel);
+   // do we have enough elements to use the unroll loop?
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, tempReg, lengthReg, -3);
+   // we need to use 4 individual masks instead for countPositives() in LE
+   if (!isLE || !isCountPositives)
+      {
+      // we want to load 0x80808080 in to maskReg, but lis was designed for signed values,
+      // and would throw an error for 0x8080, yet it could accept the equivalent negative value of it;
+      // we don't worry about sign extension since the upper word should be 0 in storeReg after lwzx
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::lis, node, maskReg, -32640);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::ori, node, maskReg, maskReg, 0x8080);
+      }
+
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialUnrollLabel);
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, tempReg);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, serialLabel, cr6);
+   // loading 4 bytes at once is slightly faster
+   generateTrg1MemInstruction(cg, TR::InstOpCode::lwzx, node, storeReg,
+                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 4));
+
+   if (isCountPositives) // when counting positives, we must consider every byte separately
+      {
+      if (isLE)
+         {
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, maskReg, storeReg, 0x80);
+         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);
+
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, maskReg, storeReg, 0x8000);
+         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);
+
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, maskReg, storeReg, 0x80);
+         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);
+
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, maskReg, storeReg, 0x8000);
+         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);
+
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
+         }
+      else
+         {
+         generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, storeReg, storeReg, maskReg);
+         generateTrg1Src1Instruction(cg, TR::InstOpCode::cntlzw, node, storeReg, storeReg);
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::srawi, node, storeReg, storeReg, 3);
+         generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, indexReg, storeReg, indexReg);
+         // storeReg now holds how many leading bytes of this word are non-negative (0..4);
+         // 4 means we need to keep checking. Compare that per-word count, not indexReg:
+         // the running index is unrelated to 4 and would make this exit test unreliable.
+         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, storeReg, 4);
+         generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);
+         }
+      }
+   else
+      {
+      generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, storeReg, storeReg, maskReg);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, storeReg, 0);
+      // this label happens to work
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, vecResultLabel, cr6);
+      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 4);
+      }
+   generateLabelInstruction(cg, TR::InstOpCode::b, node, serialUnrollLabel);
+
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialLabel);
+   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, lengthReg);
+   // if we reach the end, indexReg is len already, so we don't need to do anything for countPositives
+   if (isCountPositives)
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, endLabel, cr6);
+   else
+      generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, resultLabel, cr6);
+
+   generateTrg1MemInstruction(cg, TR::InstOpCode::lbzx, node, tempReg,
+                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 1));
+   generateTrg1Src1Instruction(cg, TR::InstOpCode::extsb, node, tempReg, tempReg);
+
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, tempReg, 0);
+   // when seeking negatives, we need to return 1
+   if (!isCountPositives)
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 1);
+   generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);
+
+   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
+   generateLabelInstruction(cg, TR::InstOpCode::b, node, serialLabel);
+
+   // --- load the length for countPositives; load 0 for hasNegative
+   generateLabelInstruction(cg, TR::InstOpCode::label, node, resultLabel);
+   if (isCountPositives)
+      generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, indexReg, lengthReg);
+   else
+      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 0);
+   // end
+
+   TR::RegisterDependencyConditions *deps =
+      new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 11, cg->trMemory());
+
+   deps->addPostCondition(startReg, TR::RealRegister::NoReg);
+   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();
+
+   deps->addPostCondition(indexReg, TR::RealRegister::NoReg);
+   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();
+
+   deps->addPostCondition(lengthReg, TR::RealRegister::NoReg);
+   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();
+
+   deps->addPostCondition(tempReg, TR::RealRegister::NoReg);
+
+   deps->addPostCondition(cr6, TR::RealRegister::cr6);
+
+   if (isCountPositives && isLE)
+      deps->addPostCondition(cr0, TR::RealRegister::cr0);
+   deps->addPostCondition(vconstant0Reg, TR::RealRegister::NoReg);
+   deps->addPostCondition(vtmp1Reg, TR::RealRegister::NoReg);
+   deps->addPostCondition(vtmp2Reg, TR::RealRegister::NoReg);
+
+   deps->addPostCondition(storeReg, TR::RealRegister::NoReg);
+   deps->addPostCondition(maskReg, TR::RealRegister::NoReg);
+
+   generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, deps);
+
+   if (isCountPositives) // if countPositives, indexReg will contain the first negative value
+      {
+      node->setRegister(indexReg);
+      cg->stopUsingRegister(tempReg);
+      }
+   else // if hasNegative, we will have a tempReg ready with zero or one
+      {
+      node->setRegister(tempReg);
+      cg->stopUsingRegister(indexReg);
+      }
+
+   cg->stopUsingRegister(startReg);
+   cg->stopUsingRegister(lengthReg);
+   cg->stopUsingRegister(cr6);
+   if (cr0 != NULL) // cr0 is only allocated for countPositives on little endian
+      cg->stopUsingRegister(cr0);
+   cg->stopUsingRegister(vconstant0Reg);
+   cg->stopUsingRegister(vtmp1Reg);
+   cg->stopUsingRegister(vtmp2Reg);
+
+   cg->stopUsingRegister(storeReg);
+   cg->stopUsingRegister(maskReg);
+
+   for (int32_t i = 0; i < node->getNumChildren(); i++)
+      {
+      cg->decReferenceCount(node->getChild(i));
+      }
+
+   if (isCountPositives) // if countPositives, indexReg will contain the first negative value
+      return indexReg;
+   return tempReg; // if hasNegative, we will have a tempReg ready with zero or one
+   }
+
 /*
  * Arraycopy evaluator needs a version of inlineArrayCopy that can be used inside internal control flow. For this version of inlineArrayCopy, registers must
  * be allocated outside of this function so the dependency at the end of the control flow knows about them.
@@ -12261,6 +12551,21 @@ J9::Power::CodeGenerator::inlineDirectCall(TR::Node *node, TR::Register *&result
             }
          break;
 
+      case TR::java_lang_StringCoding_hasNegatives:
+         if (cg->getSupportsInlineStringCodingHasNegatives())
+            {
+            resultReg = inlineStringCodingHasNegativesOrCountPositives(node, cg, false);
+            return true;
+            }
+         break;
+      case TR::java_lang_StringCoding_countPositives:
+         if (cg->getSupportsInlineStringCodingCountPositives())
+            {
+            resultReg = inlineStringCodingHasNegativesOrCountPositives(node, cg, true);
+            return true;
+            }
+         break;
+
       case TR::sun_misc_Unsafe_compareAndSwapInt_jlObjectJII_Z:
          // In Java9 this can be either the jdk.internal JNI method or the sun.misc Java wrapper.
          // In Java8 it will be sun.misc which will contain the JNI directly.