Skip to content

Fast Path StringCoding.countPositives and hasNegatives for Power #21597

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Draft
wants to merge 5 commits into
base: master
Choose a base branch
from
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions runtime/compiler/p/codegen/J9CodeGenerator.cpp
Original file line number Diff line number Diff line change
@@ -133,6 +133,20 @@ J9::Power::CodeGenerator::initialize()
cg->setEnableTLHPrefetching();
}

static bool disableInlineStringCodingHasNegatives =
feGetEnv("TR_DisableInlineStringCodingHasNegatives") != NULL;
static bool disableInlineStringCodingCountPositives =
feGetEnv("TR_DisableInlineStringCodingCountPositives") != NULL;
if (comp->target().cpu.isAtLeast(OMR_PROCESSOR_PPC_P8) &&
comp->target().cpu.supportsFeature(OMR_FEATURE_PPC_HAS_VSX) &&
!TR::Compiler->om.canGenerateArraylets())
{
if (!disableInlineStringCodingHasNegatives)
cg->setSupportsInlineStringCodingHasNegatives();
if (!disableInlineStringCodingCountPositives)
cg->setSupportsInlineStringCodingCountPositives();
}

//This env-var does 3 things:
// 1. Prevents batch clear in frontend/j9/rossa.cpp
// 2. Prevents all allocations to nonZeroTLH
286 changes: 286 additions & 0 deletions runtime/compiler/p/codegen/J9TreeEvaluator.cpp
Original file line number Diff line number Diff line change
@@ -11693,6 +11693,277 @@ static bool inlineIntrinsicInflate(TR::Node *node, TR::CodeGenerator *cg)
return true;
}

/**
 * \brief
 *    Generates the inline instruction sequence shared by the StringCoding.hasNegatives and
 *    StringCoding.countPositives recognized methods.
 *
 * \details
 *    The generated code scans \c length bytes starting at \c array[offset] in four stages:
 *      1. an empty-range check and a scalar check of the first byte,
 *      2. a 16-bytes-per-iteration vector loop (lxvw4x + vcmpgtsb.),
 *      3. a 4-bytes-per-iteration word loop (lwzx + mask test),
 *      4. a byte-at-a-time residue loop.
 *    For hasNegatives the result (0 or 1) is produced in \c tempReg. For countPositives the result is
 *    produced in \c indexReg: the index, relative to \c offset, of the first byte whose sign bit is set,
 *    or \c length when no such byte is found.
 *
 * \param node
 *    The call node. Child 0 is the byte array, child 1 the starting offset, child 2 the length.
 *
 * \param cg
 *    The code generator.
 *
 * \param isCountPositives
 *    True to generate countPositives, false to generate hasNegatives.
 *
 * \return
 *    The register holding the result; it is also set as the register of \c node.
 */
static TR::Register *inlineStringCodingHasNegativesOrCountPositives(TR::Node *node,
                                                                    TR::CodeGenerator *cg,
                                                                    bool isCountPositives)
   {
   TR::Compilation *comp = cg->comp();
   bool isLE = comp->target().cpu.isLittleEndian();

   TR::Register *startReg = cg->gprClobberEvaluate(node->getChild(0)); // array
   TR::Register *indexReg = cg->gprClobberEvaluate(node->getChild(1)); // offset
   TR::Register *lengthReg = cg->evaluate(node->getChild(2)); // length

   TR::Register *tempReg = cg->allocateRegister();

   TR::Register *cr6 = cg->allocateRegister(TR_CCR);
   // cr0 is only needed by the record-form andi./andis. tests of the little-endian countPositives word loop
   TR::Register *cr0 = NULL;
   if (isCountPositives && isLE)
      cr0 = cg->allocateRegister(TR_CCR);

   TR::Register *vconstant0Reg = cg->allocateRegister(TR_VRF);
   TR::Register *vtmp1Reg = cg->allocateRegister(TR_VRF);
   TR::Register *vtmp2Reg = cg->allocateRegister(TR_VRF);

   TR::Register *storeReg = cg->allocateRegister();
   TR::Register *maskReg = cg->allocateRegister();

   TR::LabelSymbol *VSXLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *serialPrepLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *serialUnrollLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *serialLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *vecResultLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *resultLabel = generateLabelSymbol(cg);
   TR::LabelSymbol *endLabel = generateLabelSymbol(cg);

   // NOTE(review): this sequence branches between labels but never marks the start/end of internal
   // control flow on them -- confirm whether the Power register assigner requires that here.

   // check empty
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, lengthReg, 0);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::ble, node, resultLabel, cr6);

   // skip over or load the header
#if defined(J9VM_GC_SPARSE_HEAP_ALLOCATION)
   if (TR::Compiler->om.isOffHeapAllocationEnabled())
      {
      generateTrg1MemInstruction(
         cg, TR::InstOpCode::ld, node, startReg,
         TR::MemoryReference::createWithDisplacement(
            cg, startReg, TR::Compiler->om.offsetOfContiguousDataAddrField(), 8)
      );
      }
   else
#endif /* J9VM_GC_SPARSE_HEAP_ALLOCATION */
      {
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, startReg, startReg,
                                     TR::Compiler->om.contiguousArrayHeaderSizeInBytes());
      }

   // get the starting address
   // NOTE(review): the offset child is added with a full-width add; confirm that its register is
   // sign-extended on 64-bit targets.
   generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, startReg, startReg, indexReg);
   // make the index 0 since everything we need is relative to the offset
   generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, indexReg, 0);

   // check the first byte
   generateTrg1MemInstruction(cg, TR::InstOpCode::lbzx, node, tempReg,
                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 1));
   generateTrg1Src1Instruction(cg, TR::InstOpCode::extsb, node, tempReg, tempReg);
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, tempReg, 0);
   // when seeking negatives, we need to return 1
   if (!isCountPositives)
      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 1);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);
   // if we only have one byte end it here, and return 0 for hasNegatives
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, lengthReg);
   if (!isCountPositives)
      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 0);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::beq, node, endLabel, cr6);

   // ready the zero reg
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vxor, node, vconstant0Reg, vconstant0Reg, vconstant0Reg);
   // tempReg marks the end where we could use lxv: index < length - 15 means 16 bytes remain
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, tempReg, lengthReg, -15);

   // --- start of VSXLoop
   generateLabelInstruction(cg, TR::InstOpCode::label, node, VSXLabel);
   // go to residue if we don't have enough items to do one load
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, tempReg);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, serialPrepLabel, cr6);

   // load 16 items
   generateTrg1Src2Instruction(cg, TR::InstOpCode::lxvw4x, node, vtmp1Reg, startReg, indexReg);
   if (isLE)
      {
      // NOTE(review): the compare below only reports whether ANY byte is negative, and the position is
      // recomputed by the serial loops, so this reordering does not influence the result as written.
      // It only matters if the commented-out vclzlsbb path under vecResultLabel is revived.

      // swap around the shorts in each word; vspltisw cannot splat 16 directly, so build it as 8 + 8
      generateTrg1ImmInstruction(cg, TR::InstOpCode::vspltisw, node, vtmp2Reg, 8);
      generateTrg1Src2Instruction(cg, TR::InstOpCode::vadduwm, node, vtmp2Reg, vtmp2Reg, vtmp2Reg);
      generateTrg1Src2Instruction(cg, TR::InstOpCode::vrlw, node, vtmp1Reg, vtmp1Reg, vtmp2Reg);
      // then swap around the bytes in each short
      generateTrg1ImmInstruction(cg, TR::InstOpCode::vspltish, node, vtmp2Reg, 8);
      generateTrg1Src2Instruction(cg, TR::InstOpCode::vrlh, node, vtmp1Reg, vtmp1Reg, vtmp2Reg);
      }
   // bit 2 of cr6 (ZERO) will not be set if any comparison is true
   generateTrg1Src2Instruction(cg, TR::InstOpCode::vcmpgtsb_r, node, vtmp1Reg, vconstant0Reg, vtmp1Reg);
   // branch when the ZERO bit is not set
   generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, vecResultLabel, cr6);

   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 16);
   generateLabelInstruction(cg, TR::InstOpCode::b, node, VSXLabel);

   // --- this label is used when we exit from the VSXLoop, and by the hasNegatives word loop below
   generateLabelInstruction(cg, TR::InstOpCode::label, node, vecResultLabel);
   if (isCountPositives) // jump to the serial label which should soon count to the value we want
      {
      //generateTrg1Src1Instruction(cg, TR::InstOpCode::vclzlsbb, node, returnReg, vtmp1Reg);
      //generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, returnReg, returnReg, indexReg);
      generateLabelInstruction(cg, TR::InstOpCode::b, node, serialPrepLabel);
      }
   else // just report 1
      {
      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 1);
      generateLabelInstruction(cg, TR::InstOpCode::b, node, endLabel);
      }

   // --- serialPrepLabel to deal with whatever remains
   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialPrepLabel);
   // do we have enough elements to use the unroll loop? index < length - 3 means 4 bytes remain
   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, tempReg, lengthReg, -3);
   // we need to use 4 individual masks instead for countPositives() in LE
   if (!isLE || !isCountPositives)
      {
      // we want to load 0x80808080 in to maskReg, but lis was designed for signed values,
      // and would throw an error for 0x8080, yet it could accept the equivalent negative value of it;
      // we don't worry about sign extension since the upper word should be 0 in storeReg after lwzx
      generateTrg1ImmInstruction(cg, TR::InstOpCode::lis, node, maskReg, -32640);
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::ori, node, maskReg, maskReg, 0x8080);
      }

   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialUnrollLabel);
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, tempReg);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, serialLabel, cr6);
   // loading 4 bytes at once is slightly faster
   generateTrg1MemInstruction(cg, TR::InstOpCode::lwzx, node, storeReg,
                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 4));

   if (isCountPositives) // when counting positives, we must consider every byte separately
      {
      if (isLE)
         {
         // test the sign bit of each byte from the lowest-order byte of the loaded word upwards,
         // bumping indexReg after each byte that passes; maskReg is only a scratch target here
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, maskReg, storeReg, 0x80);
         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andi_r, node, maskReg, storeReg, 0x8000);
         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, maskReg, storeReg, 0x80);
         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::andis_r, node, maskReg, storeReg, 0x8000);
         generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, endLabel, cr0);

         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
         }
      else
         {
         // keep only the sign bits; (leading zero count / 8) is then the number of leading
         // non-negative bytes in this word: 0..3 if a negative byte was found, 4 if none was
         generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, storeReg, storeReg, maskReg);
         generateTrg1Src1Instruction(cg, TR::InstOpCode::cntlzw, node, storeReg, storeReg);
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::srawi, node, storeReg, storeReg, 3);
         generateTrg1Src2Instruction(cg, TR::InstOpCode::add, node, indexReg, storeReg, indexReg);
         // 4 means we need to keep checking; anything less means indexReg now points at the first
         // negative byte. The per-word count (storeReg) must be tested here, not the running index:
         // testing indexReg would miss every hit once the index has reached 4 and could loop forever
         // re-reading a word whose first byte is negative.
         generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, storeReg, 4);
         generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);
         }
      }
   else
      {
      generateTrg1Src2Instruction(cg, TR::InstOpCode::AND, node, storeReg, storeReg, maskReg);
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, storeReg, 0);
      // vecResultLabel loads 1 into tempReg and branches to endLabel for hasNegatives, which is what we want
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bne, node, vecResultLabel, cr6);
      generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 4);
      }
   generateLabelInstruction(cg, TR::InstOpCode::b, node, serialUnrollLabel);

   // --- byte-at-a-time residue loop
   generateLabelInstruction(cg, TR::InstOpCode::label, node, serialLabel);
   generateTrg1Src2Instruction(cg, TR::InstOpCode::cmp4, node, cr6, indexReg, lengthReg);
   // if we reach the end, indexReg is len already, so we don't need to do anything for countPositives
   if (isCountPositives)
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, endLabel, cr6);
   else
      generateConditionalBranchInstruction(cg, TR::InstOpCode::bge, node, resultLabel, cr6);

   generateTrg1MemInstruction(cg, TR::InstOpCode::lbzx, node, tempReg,
                              TR::MemoryReference::createWithIndexReg(cg, startReg, indexReg, 1));
   generateTrg1Src1Instruction(cg, TR::InstOpCode::extsb, node, tempReg, tempReg);

   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::cmpi4, node, cr6, tempReg, 0);
   // when seeking negatives, we need to return 1
   if (!isCountPositives)
      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 1);
   generateConditionalBranchInstruction(cg, TR::InstOpCode::blt, node, endLabel, cr6);

   generateTrg1Src1ImmInstruction(cg, TR::InstOpCode::addi, node, indexReg, indexReg, 1);
   generateLabelInstruction(cg, TR::InstOpCode::b, node, serialLabel);

   // --- load the length for countPositives; load 0 for hasNegatives
   generateLabelInstruction(cg, TR::InstOpCode::label, node, resultLabel);
   if (isCountPositives)
      generateTrg1Src1Instruction(cg, TR::InstOpCode::mr, node, indexReg, lengthReg);
   else
      generateTrg1ImmInstruction(cg, TR::InstOpCode::li, node, tempReg, 0);
   // end

   // 11 post conditions: 4 GPR operands, cr6, optional cr0, 3 vector registers and 2 GPR scratch registers
   TR::RegisterDependencyConditions *deps =
      new (cg->trHeapMemory()) TR::RegisterDependencyConditions(0, 11, cg->trMemory());

   deps->addPostCondition(startReg, TR::RealRegister::NoReg);
   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();

   deps->addPostCondition(indexReg, TR::RealRegister::NoReg);
   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();

   deps->addPostCondition(lengthReg, TR::RealRegister::NoReg);
   deps->getPostConditions()->getRegisterDependency(deps->getAddCursorForPost() - 1)->setExcludeGPR0();

   deps->addPostCondition(tempReg, TR::RealRegister::NoReg);

   deps->addPostCondition(cr6, TR::RealRegister::cr6);

   if (isCountPositives && isLE)
      deps->addPostCondition(cr0, TR::RealRegister::cr0);
   deps->addPostCondition(vconstant0Reg, TR::RealRegister::NoReg);
   deps->addPostCondition(vtmp1Reg, TR::RealRegister::NoReg);
   deps->addPostCondition(vtmp2Reg, TR::RealRegister::NoReg);

   deps->addPostCondition(storeReg, TR::RealRegister::NoReg);
   deps->addPostCondition(maskReg, TR::RealRegister::NoReg);

   generateDepLabelInstruction(cg, TR::InstOpCode::label, node, endLabel, deps);

   if (isCountPositives) // if countPositives, indexReg will contain the index of the first negative value
      {
      node->setRegister(indexReg);
      cg->stopUsingRegister(tempReg);
      }
   else // if hasNegatives, we will have a tempReg ready with zero or one
      {
      node->setRegister(tempReg);
      cg->stopUsingRegister(indexReg);
      }

   cg->stopUsingRegister(startReg);
   cg->stopUsingRegister(lengthReg);
   cg->stopUsingRegister(cr6);
   // cr0 was only allocated for the little-endian countPositives sequence
   if (cr0 != NULL)
      cg->stopUsingRegister(cr0);
   cg->stopUsingRegister(vconstant0Reg);
   cg->stopUsingRegister(vtmp1Reg);
   cg->stopUsingRegister(vtmp2Reg);

   cg->stopUsingRegister(storeReg);
   cg->stopUsingRegister(maskReg);

   for (int32_t i = 0; i < node->getNumChildren(); i++)
      {
      cg->decReferenceCount(node->getChild(i));
      }

   if (isCountPositives) // if countPositives, indexReg will contain the index of the first negative value
      return indexReg;
   return tempReg; // if hasNegatives, we will have a tempReg ready with zero or one
   }

/*
* Arraycopy evaluator needs a version of inlineArrayCopy that can be used inside internal control flow. For this version of inlineArrayCopy, registers must
* be allocated outside of this function so the dependency at the end of the control flow knows about them.
@@ -12261,6 +12532,21 @@ J9::Power::CodeGenerator::inlineDirectCall(TR::Node *node, TR::Register *&result
}
break;

case TR::java_lang_StringCoding_hasNegatives:
if (cg->getSupportsInlineStringCodingHasNegatives())
{
resultReg = inlineStringCodingHasNegativesOrCountPositives(node, cg, false);
return true;
}
break;
case TR::java_lang_StringCoding_countPositives:
if (cg->getSupportsInlineStringCodingCountPositives())
{
resultReg = inlineStringCodingHasNegativesOrCountPositives(node, cg, true);
return true;
}
break;

case TR::sun_misc_Unsafe_compareAndSwapInt_jlObjectJII_Z:
// In Java9 this can be either the jdk.internal JNI method or the sun.misc Java wrapper.
// In Java8 it will be sun.misc which will contain the JNI directly.